264 lines
8.9 KiB
Diff
264 lines
8.9 KiB
Diff
|
|
From ce7c6c491ed0750a10f9a52b5edc710d978e70e2 Mon Sep 17 00:00:00 2001
|
||
|
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||
|
|
Date: Mon, 15 Jul 2024 16:19:17 +0800
|
||
|
|
Subject: [PATCH 07/10] x86: Disable non-temporal memset on Skylake Server
|
||
|
|
|
||
|
|
The original commit enabling non-temporal memset on Skylake Server had
|
||
|
|
erroneous benchmarks (actually done on ICX).
|
||
|
|
|
||
|
|
Further benchmarks indicate non-temporal stores may in fact be a
|
||
|
|
regression on Skylake Server.
|
||
|
|
|
||
|
|
This commit may be over-cautious in some cases, but should avoid any
|
||
|
|
regressions for 2.40.
|
||
|
|
|
||
|
|
Tested using qemu on all x86_64 cpu arch supported by both qemu +
|
||
|
|
GLIBC.
|
||
|
|
|
||
|
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||
|
|
---
|
||
|
|
sysdeps/x86/cpu-features.c | 13 +-
|
||
|
|
sysdeps/x86/cpu-tunables.c | 6 +
|
||
|
|
sysdeps/x86/dl-cacheinfo.h | 15 +-
|
||
|
|
...cpu-features-preferred_feature_index_1.def | 1 +
|
||
|
|
sysdeps/x86/tst-hwcap-tunables.c | 148 ++++++++++++++++++
|
||
|
|
5 files changed, 173 insertions(+), 10 deletions(-)
|
||
|
|
create mode 100644 sysdeps/x86/tst-hwcap-tunables.c
|
||
|
|
|
||
|
|
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||
|
|
index c4dd85145e..b4030776a7 100644
|
||
|
|
--- a/sysdeps/x86/cpu-features.c
|
||
|
|
+++ b/sysdeps/x86/cpu-features.c
|
||
|
|
@@ -754,11 +754,18 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||
|
|
|
||
|
|
/* Newer Bigcore microarch (larger non-temporal store
|
||
|
|
threshold). */
|
||
|
|
- case INTEL_BIGCORE_SKYLAKE:
|
||
|
|
- case INTEL_BIGCORE_KABYLAKE:
|
||
|
|
- case INTEL_BIGCORE_COMETLAKE:
|
||
|
|
case INTEL_BIGCORE_SKYLAKE_AVX512:
|
||
|
|
case INTEL_BIGCORE_CANNONLAKE:
|
||
|
|
+ /* Benchmarks indicate non-temporal memset is not
|
||
|
|
+ necessarily profitable on SKX (and in some cases much
|
||
|
|
+ worse). This is likely unique to SKX due to its unique
|
||
|
|
+ mesh interconnect (not present on ICX or BWD). Disable
|
||
|
|
+ non-temporal on all Skylake servers. */
|
||
|
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||
|
|
+ |= bit_arch_Avoid_Non_Temporal_Memset;
|
||
|
|
+ case INTEL_BIGCORE_COMETLAKE:
|
||
|
|
+ case INTEL_BIGCORE_SKYLAKE:
|
||
|
|
+ case INTEL_BIGCORE_KABYLAKE:
|
||
|
|
case INTEL_BIGCORE_ICELAKE:
|
||
|
|
case INTEL_BIGCORE_TIGERLAKE:
|
||
|
|
case INTEL_BIGCORE_ROCKETLAKE:
|
||
|
|
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
||
|
|
index 0d4f328585..b8475730ea 100644
|
||
|
|
--- a/sysdeps/x86/cpu-tunables.c
|
||
|
|
+++ b/sysdeps/x86/cpu-tunables.c
|
||
|
|
@@ -272,6 +272,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||
|
|
disable, 24);
|
||
|
|
}
|
||
|
|
break;
|
||
|
|
+ case 25:
|
||
|
|
+ {
|
||
|
|
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
|
||
|
|
+ Avoid_Non_Temporal_Memset,
|
||
|
|
+ disable, 25);
|
||
|
|
+ }
|
||
|
|
case 26:
|
||
|
|
{
|
||
|
|
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
|
||
|
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||
|
|
index a76df092e6..de4584116f 100644
|
||
|
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||
|
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||
|
|
@@ -1051,13 +1051,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||
|
|
/* Non-temporal stores are more performant on Intel and AMD hardware above
|
||
|
|
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
||
|
|
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||
|
|
- if (cpu_features->basic.kind == arch_kind_intel
|
||
|
|
- || cpu_features->basic.kind == arch_kind_amd)
|
||
|
|
- memset_non_temporal_threshold = non_temporal_threshold;
|
||
|
|
-
|
||
|
|
- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||
|
|
- cases slower than the vectorized path (and for some alignments,
|
||
|
|
- it is really slow, check BZ #30994). */
|
||
|
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
|
||
|
|
+ && (cpu_features->basic.kind == arch_kind_intel
|
||
|
|
+ || cpu_features->basic.kind == arch_kind_amd))
|
||
|
|
+ memset_non_temporal_threshold = non_temporal_threshold;
|
||
|
|
+
|
||
|
|
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||
|
|
+ cases slower than the vectorized path (and for some alignments,
|
||
|
|
+ it is really slow, check BZ #30994). */
|
||
|
|
if (cpu_features->basic.kind == arch_kind_amd)
|
||
|
|
rep_movsb_threshold = non_temporal_threshold;
|
||
|
|
|
||
|
|
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||
|
|
index d20c5b3196..aae1c85551 100644
|
||
|
|
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||
|
|
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||
|
|
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
|
||
|
|
BIT (MathVec_Prefer_No_AVX512)
|
||
|
|
BIT (Prefer_FSRM)
|
||
|
|
BIT (Avoid_Short_Distance_REP_MOVSB)
|
||
|
|
+BIT (Avoid_Non_Temporal_Memset)
|
||
|
|
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
|
||
|
|
new file mode 100644
|
||
|
|
index 0000000000..94307283d7
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/x86/tst-hwcap-tunables.c
|
||
|
|
@@ -0,0 +1,148 @@
|
||
|
|
+/* Tests for x86 GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
|
||
|
|
+ Copyright (C) 2023-2024 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library; if not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include <array_length.h>
|
||
|
|
+#include <getopt.h>
|
||
|
|
+#include <ifunc-impl-list.h>
|
||
|
|
+#include <spawn.h>
|
||
|
|
+#include <stdio.h>
|
||
|
|
+#include <stdlib.h>
|
||
|
|
+#include <string.h>
|
||
|
|
+#include <intprops.h>
|
||
|
|
+#include <support/check.h>
|
||
|
|
+#include <support/support.h>
|
||
|
|
+#include <support/xunistd.h>
|
||
|
|
+#include <support/capture_subprocess.h>
|
||
|
|
+
|
||
|
|
+/* Nonzero if the program gets called via `exec'. */
|
||
|
|
+#define CMDLINE_OPTIONS \
|
||
|
|
+ { "restart", no_argument, &restart, 1 },
|
||
|
|
+static int restart;
|
||
|
|
+
|
||
|
|
+/* memcpy implementations expected to be reported as unusable. */
|
||
|
|
+static const char *test_1[] =
|
||
|
|
+{
|
||
|
|
+ "__memcpy_avx512_no_vzeroupper",
|
||
|
|
+ "__memcpy_avx512_unaligned",
|
||
|
|
+ "__memcpy_avx512_unaligned_erms",
|
||
|
|
+ "__memcpy_evex_unaligned",
|
||
|
|
+ "__memcpy_evex_unaligned_erms",
|
||
|
|
+ "__memcpy_avx_unaligned",
|
||
|
|
+ "__memcpy_avx_unaligned_erms",
|
||
|
|
+ "__memcpy_avx_unaligned_rtm",
|
||
|
|
+ "__memcpy_avx_unaligned_erms_rtm",
|
||
|
|
+ "__memcpy_ssse3",
|
||
|
|
+};
|
||
|
|
+
|
||
|
|
+static const struct test_t
|
||
|
|
+{
|
||
|
|
+ const char *env;
|
||
|
|
+ const char *const *funcs;
|
||
|
|
+ size_t nfuncs;
|
||
|
|
+} tests[] =
|
||
|
|
+{
|
||
|
|
+ {
|
||
|
|
+ /* Disable everything. */
|
||
|
|
+ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
|
||
|
|
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
|
||
|
|
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
|
||
|
|
+ test_1,
|
||
|
|
+ array_length (test_1)
|
||
|
|
+ },
|
||
|
|
+ {
|
||
|
|
+ /* Same as before, but with some empty suboptions. */
|
||
|
|
+ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
|
||
|
|
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
|
||
|
|
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
|
||
|
|
+ test_1,
|
||
|
|
+ array_length (test_1)
|
||
|
|
+ }
|
||
|
|
+};
|
||
|
|
+
|
||
|
|
+/* Called on process re-execution. */
|
||
|
|
+_Noreturn static void
|
||
|
|
+handle_restart (int ntest)
|
||
|
|
+{
|
||
|
|
+ struct libc_ifunc_impl impls[32];
|
||
|
|
+ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
|
||
|
|
+ if (cnt == 0)
|
||
|
|
+ _exit (EXIT_SUCCESS);
|
||
|
|
+ TEST_VERIFY_EXIT (cnt >= 1);
|
||
|
|
+ for (int i = 0; i < cnt; i++)
|
||
|
|
+ {
|
||
|
|
+ for (int f = 0; f < tests[ntest].nfuncs; f++)
|
||
|
|
+ {
|
||
|
|
+ if (strcmp (impls[i].name, tests[ntest].funcs[f]) == 0)
|
||
|
|
+ TEST_COMPARE (impls[i].usable, false);
|
||
|
|
+ }
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ _exit (EXIT_SUCCESS);
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+static int
|
||
|
|
+do_test (int argc, char *argv[])
|
||
|
|
+{
|
||
|
|
+ /* We must have either:
|
||
|
|
+ - One or four parameters left if called initially:
|
||
|
|
+ + path to ld.so optional
|
||
|
|
+ + "--library-path" optional
|
||
|
|
+ + the library path optional
|
||
|
|
+ + the application name
|
||
|
|
+ + the test to check */
|
||
|
|
+
|
||
|
|
+ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
|
||
|
|
+
|
||
|
|
+ if (restart)
|
||
|
|
+ handle_restart (atoi (argv[1]));
|
||
|
|
+
|
||
|
|
+ char nteststr[INT_BUFSIZE_BOUND (int)];
|
||
|
|
+
|
||
|
|
+ char *spargv[10];
|
||
|
|
+ {
|
||
|
|
+ int i = 0;
|
||
|
|
+ for (; i < argc - 1; i++)
|
||
|
|
+ spargv[i] = argv[i + 1];
|
||
|
|
+ spargv[i++] = (char *) "--direct";
|
||
|
|
+ spargv[i++] = (char *) "--restart";
|
||
|
|
+ spargv[i++] = nteststr;
|
||
|
|
+ spargv[i] = NULL;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ for (int i = 0; i < array_length (tests); i++)
|
||
|
|
+ {
|
||
|
|
+ snprintf (nteststr, sizeof nteststr, "%d", i);
|
||
|
|
+
|
||
|
|
+ printf ("[%d] Spawned test for %s\n", i, tests[i].env);
|
||
|
|
+ char *tunable = xasprintf ("glibc.cpu.hwcaps=%s", tests[i].env);
|
||
|
|
+ setenv ("GLIBC_TUNABLES", tunable, 1);
|
||
|
|
+
|
||
|
|
+ struct support_capture_subprocess result
|
||
|
|
+ = support_capture_subprogram (spargv[0], spargv, NULL);
|
||
|
|
+ support_capture_subprocess_check (&result, "tst-tunables", 0,
|
||
|
|
+ sc_allow_stderr);
|
||
|
|
+ support_capture_subprocess_free (&result);
|
||
|
|
+
|
||
|
|
+ free (tunable);
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
+ return 0;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+#define TEST_FUNCTION_ARGV do_test
|
||
|
|
+#include <support/test-driver.c>
|
||
|
|
--
|
||
|
|
2.17.1
|
||
|
|
|