!1014 [sync] PR-1012: x86: Add support for Hygon processors

From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
This commit is contained in:
openeuler-ci-bot 2025-03-13 06:59:26 +00:00 committed by Gitee
commit d9f212c1d8
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
11 changed files with 1297 additions and 1 deletions

View File

@ -0,0 +1,210 @@
From 17f7ca193d60fefd6cc5e48aacd1ce9f7dd29862 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 14 Aug 2024 14:37:31 +0800
Subject: [PATCH 09/10] x86: Add `Avoid_STOSB` tunable to allow NT memset
without ERMS
The goal of this flag is to allow targets which don't prefer/have ERMS
to still access the non-temporal memset implementation.
There are 4 cases for tuning memset:
1) `Avoid_STOSB && Avoid_Non_Temporal_Memset`
- Memset with temporal stores
2) `Avoid_STOSB && !Avoid_Non_Temporal_Memset`
- Memset with temporal/non-temporal stores. Non-temporal path
goes through `rep stosb` path. We accomplish this by setting
`x86_rep_stosb_threshold` to
`x86_memset_non_temporal_threshold`.
3) `!Avoid_STOSB && Avoid_Non_Temporal_Memset`
- Memset with temporal stores/`rep stosb`
3) `!Avoid_STOSB && !Avoid_Non_Temporal_Memset`
- Memset with temporal stores/`rep stosb`/non-temporal stores.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 4 +++
sysdeps/x86/cpu-tunables.c | 2 ++
sysdeps/x86/dl-cacheinfo.h | 34 ++++++++++++++++---
...cpu-features-preferred_feature_index_1.def | 1 +
sysdeps/x86/tst-hwcap-tunables.c | 6 ++--
sysdeps/x86_64/multiarch/ifunc-memset.h | 18 +++++++---
6 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c9f2297524..287edc5b08 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -1014,6 +1014,10 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
cpu_features->preferred[index_arch_I686] |= bit_arch_I686;
+ /* No ERMS, we want to avoid stosb for memset. */
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB;
+
#if !HAS_CPUID
no_cpuid:
#endif
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index b8475730ea..a4bbf13080 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -214,6 +214,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Prefer_FSRM,
disable, 11);
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, Avoid_STOSB,
+ disable, 11);
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH (n, cpu_features,
Slow_SSE4_2,
SSE4_2,
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index d8288f0b0c..5803bfcea8 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1096,18 +1096,42 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
long int, NULL);
+ /*
+ For memset, the non-temporal implementation is only accessed through the
+ stosb code. ie:
+ ```
+ if (size >= rep_stosb_thresh)
+ {
+ if (size >= non_temporal_thresh)
+ {
+ do_non_temporal ();
+ }
+ do_stosb ();
+ }
+ do_normal_vec_loop ();
+ ```
+ So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
+ to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
+ `rep stosb` will never be used.
+ */
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+ memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, SIZE_MAX);
+ /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the
+ final value of `x86_memset_non_temporal_threshold`. In some cases this can
+ be a matter of correctness. */
+ if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB))
+ rep_stosb_threshold
+ = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
+ SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
- TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
- memset_non_temporal_threshold,
- minimum_non_temporal_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
- TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
- SIZE_MAX);
unsigned long int rep_movsb_stop_threshold;
/* Setting the upper bound of ERMS to the computed value of
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index aae1c85551..38a0c9226c 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
BIT (Avoid_Non_Temporal_Memset)
+BIT (Avoid_STOSB)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index 94307283d7..1920f5057e 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -60,7 +60,8 @@ static const struct test_t
/* Disable everything. */
"-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
- "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
+ "-Avoid_STOSB",
test_1,
array_length (test_1)
},
@@ -68,7 +69,8 @@ static const struct test_t
/* Same as before, but with some empty suboptions. */
",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
- "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
+ "-Avoid_STOSB,-,",
test_1,
array_length (test_1)
}
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 5c5096ec5a..6b3b9a17a2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
+static inline int
+prefer_erms_nt_impl (const struct cpu_features *cpu_features)
+{
+ return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
+}
+
static inline void *
IFUNC_SELECTOR (void)
{
@@ -61,7 +68,7 @@ IFUNC_SELECTOR (void)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx512_unaligned_erms);
return OPTIMIZE (avx512_unaligned);
@@ -76,7 +83,7 @@ IFUNC_SELECTOR (void)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (evex_unaligned_erms);
return OPTIMIZE (evex_unaligned);
@@ -84,7 +91,7 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx2_unaligned_erms_rtm);
return OPTIMIZE (avx2_unaligned_rtm);
@@ -93,14 +100,15 @@ IFUNC_SELECTOR (void)
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
Prefer_No_VZEROUPPER, !))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx2_unaligned_erms);
return OPTIMIZE (avx2_unaligned);
}
}
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
return OPTIMIZE (sse2_unaligned_erms);
return OPTIMIZE (sse2_unaligned);
--
2.17.1

View File

@ -0,0 +1,95 @@
From 01b5cac929a3be361dd575bed6673c40a25a6d61 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 14 Aug 2024 14:37:30 +0800
Subject: [PATCH 08/10] x86: Use `Avoid_Non_Temporal_Memset` to control
non-temporal path
This is just a refactor and there should be no behavioral change from
this commit.
The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
for controlling whether we use non-temporal memset rather than having
extra logic based on vendor.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
2 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index b4030776a7..c9f2297524 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -640,6 +640,12 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
+ /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
+ as of writing this, we only have benchmarks indicatings it profitability
+ on Intel/AMD. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+
cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
@@ -665,6 +671,11 @@ init_cpu_features (struct cpu_features *cpu_features)
update_active (cpu_features);
+ /* Benchmarks indicate non-temporal memset can be profitable on Intel
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (family == 0x06)
{
model += extended_model;
@@ -874,6 +885,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
+ /* Benchmarks indicate non-temporal memset can be profitable on AMD
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
{
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index de4584116f..d8288f0b0c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,14 +1048,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores are more performant on Intel and AMD hardware above
- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
- && (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd))
- memset_non_temporal_threshold = non_temporal_threshold;
-
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1077,6 +1069,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (tunable_size != 0)
shared = tunable_size;
+ /* Non-temporal stores are more performant on some hardware above
+ non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
+ Intel and AMD hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
if (tunable_size > minimum_non_temporal_threshold
&& tunable_size <= maximum_non_temporal_threshold)
--
2.17.1

View File

@ -0,0 +1,97 @@
From daa15a5bffc436cf7b943b306c85c90ce8bb369e Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:54 +0800
Subject: [PATCH 02/10] x86: Add cache information support for Hygon processors
Add hygon branch in dl_init_cacheinfo function to initialize
cache size variables for hygon processors. In the meanwhile,
add handle_hygon() function to get cache information.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 60 ++++++++++++++++++++++++++++++++++++++
1 file changed, 60 insertions(+)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 7b5ed210ca..85c404dd26 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -567,6 +567,48 @@ handle_zhaoxin (int name)
return 0;
}
+static long int __attribute__ ((noinline))
+handle_hygon (int name)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ unsigned int count = 0x1;
+
+ if (name >= _SC_LEVEL3_CACHE_SIZE)
+ count = 0x3;
+ else if (name >= _SC_LEVEL2_CACHE_SIZE)
+ count = 0x2;
+ else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+ count = 0x0;
+
+ /* Use __cpuid__ '0x8000_001D' to compute cache details. */
+ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+
+ switch (name)
+ {
+ case _SC_LEVEL1_ICACHE_ASSOC:
+ case _SC_LEVEL1_DCACHE_ASSOC:
+ case _SC_LEVEL2_CACHE_ASSOC:
+ case _SC_LEVEL3_CACHE_ASSOC:
+ return ((ebx >> 22) & 0x3ff) + 1;
+ case _SC_LEVEL1_ICACHE_LINESIZE:
+ case _SC_LEVEL1_DCACHE_LINESIZE:
+ case _SC_LEVEL2_CACHE_LINESIZE:
+ case _SC_LEVEL3_CACHE_LINESIZE:
+ return (ebx & 0xfff) + 1;
+ case _SC_LEVEL1_ICACHE_SIZE:
+ case _SC_LEVEL1_DCACHE_SIZE:
+ case _SC_LEVEL2_CACHE_SIZE:
+ case _SC_LEVEL3_CACHE_SIZE:
+ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
+ default:
+ __builtin_unreachable ();
+ }
+ return -1;
+}
+
static void
get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
long int core)
@@ -890,6 +932,24 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
shared_per_thread = shared;
}
+ else if (cpu_features->basic.kind == arch_kind_hygon)
+ {
+ data = handle_hygon (_SC_LEVEL1_DCACHE_SIZE);
+ shared = handle_hygon (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
+
+ level1_icache_size = handle_hygon (_SC_LEVEL1_ICACHE_SIZE);
+ level1_icache_linesize = handle_hygon (_SC_LEVEL1_ICACHE_LINESIZE);
+ level1_dcache_size = data;
+ level1_dcache_assoc = handle_hygon (_SC_LEVEL1_DCACHE_ASSOC);
+ level1_dcache_linesize = handle_hygon (_SC_LEVEL1_DCACHE_LINESIZE);
+ level2_cache_size = handle_hygon (_SC_LEVEL2_CACHE_SIZE);;
+ level2_cache_assoc = handle_hygon (_SC_LEVEL2_CACHE_ASSOC);
+ level2_cache_linesize = handle_hygon (_SC_LEVEL2_CACHE_LINESIZE);
+ level3_cache_size = shared;
+ level3_cache_assoc = handle_hygon (_SC_LEVEL3_CACHE_ASSOC);
+ level3_cache_linesize = handle_hygon (_SC_LEVEL3_CACHE_LINESIZE);
+ }
cpu_features->level1_icache_size = level1_icache_size;
cpu_features->level1_icache_linesize = level1_icache_linesize;
--
2.17.1

View File

@ -0,0 +1,69 @@
From 3215d6157f5f94706aa5db6783838885a8a3c4f1 Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:53 +0800
Subject: [PATCH 01/10] x86: Add new architecture type for Hygon processors
Add a new architecture type arch_kind_hygon to spilt Hygon branch
from AMD. This is to facilitate the Hygon processors to make settings
that are suitable for its own characteristics.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 19 ++++++++++++++++---
sysdeps/x86/include/cpu-features.h | 1 +
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index f752ebd24d..c4dd85145e 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -851,9 +851,8 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
|= bit_arch_Avoid_Short_Distance_REP_MOVSB;
}
- /* This spells out "AuthenticAMD" or "HygonGenuine". */
- else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
- || (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
+ /* This spells out "AuthenticAMD". */
+ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
{
unsigned int extended_model;
@@ -963,6 +962,20 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
}
}
}
+ /* This spells out "HygonGenuine". */
+ else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
+ {
+ unsigned int extended_model;
+
+ kind = arch_kind_hygon;
+
+ get_common_indices (cpu_features, &family, &model, &extended_model,
+ &stepping);
+
+ get_extended_indices (cpu_features);
+
+ update_active (cpu_features);
+ }
else
{
kind = arch_kind_other;
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index eb30d342a6..594feeb2f4 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -856,6 +856,7 @@ enum cpu_features_kind
arch_kind_intel,
arch_kind_amd,
arch_kind_zhaoxin,
+ arch_kind_hygon,
arch_kind_other
};
--
2.17.1

View File

@ -0,0 +1,211 @@
From 4ad2c9d04b76d7c4a42d80a82c022cd60b43b8b2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 24 May 2024 12:38:51 -0500
Subject: [PATCH 04/10] x86: Add seperate non-temporal tunable for memset
The tuning for non-temporal stores for memset vs memcpy is not always
the same. This includes both the exact value and whether non-temporal
stores are profitable at all for a given arch.
This patch add `x86_memset_non_temporal_threshold`. Currently we
disable non-temporal stores for non Intel vendors as the only
benchmarks showing its benefit have been on Intel hardware.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
manual/tunables.texi | 16 +++++++++++++++-
sysdeps/x86/cacheinfo.h | 8 +++++++-
sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
sysdeps/x86/dl-tunables.list | 3 +++
sysdeps/x86/include/cpu-features.h | 4 +++-
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
7 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 6493904bae..2a2877884c 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.cpu.x86_shstk:
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
@@ -486,7 +487,8 @@ thread stack originally backup by Huge Pages to default pages.
@cindex shared_cache_size tunables
@cindex tunables, shared_cache_size
@cindex non_temporal_threshold tunables
-@cindex tunables, non_temporal_threshold
+@cindex memset_non_temporal_threshold tunables
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
@deftp {Tunable namespace} glibc.cpu
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
@@ -562,6 +564,18 @@ like memmove and memcpy.
This tunable is specific to i386 and x86-64.
@end deftp
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
+the user to set threshold in bytes for non temporal store in
+memset. Non temporal stores give a hint to the hardware to move data
+directly to memory without displacing other data from the cache. This
+tunable is used by some platforms to determine when to use non
+temporal stores memset.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
+
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
set threshold in bytes to start using "rep movsb". The value must be
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index ec1bc142c4..fd2b2ae66b 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
-/* Threshold to use non temporal store. */
+/* Threshold to use non temporal store in memmove. */
long int __x86_shared_non_temporal_threshold attribute_hidden;
+/* Threshold to use non temporal store in memset. */
+long int __x86_memset_non_temporal_threshold attribute_hidden;
+
/* Threshold to use Enhanced REP MOVSB. */
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
@@ -77,6 +80,9 @@ init_cacheinfo (void)
__x86_shared_non_temporal_threshold
= cpu_features->non_temporal_threshold;
+ __x86_memset_non_temporal_threshold
+ = cpu_features->memset_non_temporal_threshold;
+
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index ce2e6927e4..9f27da21ce 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,6 +1048,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* Non-temporal stores in memset have only been tested on Intel hardware.
+ Until we benchmark data on other x86 processor, disable non-temporal
+ stores in memset. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (cpu_features->basic.kind == arch_kind_intel)
+ memset_non_temporal_threshold = non_temporal_threshold;
+
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1074,6 +1081,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
&& tunable_size <= maximum_non_temporal_threshold)
non_temporal_threshold = tunable_size;
+ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ if (tunable_size > minimum_non_temporal_threshold
+ && tunable_size <= maximum_non_temporal_threshold)
+ memset_non_temporal_threshold = tunable_size;
+
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
if (tunable_size > minimum_rep_movsb_threshold)
rep_movsb_threshold = tunable_size;
@@ -1089,6 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (
+ x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, maximum_non_temporal_threshold);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
@@ -1102,6 +1117,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
+ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 5aab63e532..05d54b5eba 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -83,6 +83,8 @@ _dl_diagnostics_cpu (void)
cpu_features->shared_cache_size);
print_cpu_features_value ("non_temporal_threshold",
cpu_features->non_temporal_threshold);
+ print_cpu_features_value ("memset_non_temporal_threshold",
+ cpu_features->memset_non_temporal_threshold);
print_cpu_features_value ("rep_movsb_threshold",
cpu_features->rep_movsb_threshold);
print_cpu_features_value ("rep_movsb_stop_threshold",
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index d1442d88ba..53852d6a07 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -30,6 +30,9 @@ glibc {
x86_non_temporal_threshold {
type: SIZE_T
}
+ x86_memset_non_temporal_threshold {
+ type: SIZE_T
+ }
x86_rep_movsb_threshold {
type: SIZE_T
# Since there is overhead to set up REP MOVSB operation, REP
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 594feeb2f4..e2d641dcd0 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -918,8 +918,10 @@ struct cpu_features
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
unsigned long int shared_cache_size;
- /* Threshold to use non temporal store. */
+ /* Threshold to use non temporal store in memmove. */
unsigned long int non_temporal_threshold;
+ /* Threshold to use non temporal store in memset. */
+ unsigned long int memset_non_temporal_threshold;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
/* Threshold to stop using "rep movsb". */
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index aba45e3da0..d95750b516 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -24,9 +24,9 @@
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done.
6. On machines ERMS feature, if size is range
- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
then REP STOSB will be used.
- 7. If size >= __x86_shared_non_temporal_threshold, use a
+ 7. If size >= __x86_memset_non_temporal_threshold, use a
non-temporal stores. */
#include <sysdep.h>
@@ -318,7 +318,7 @@ L(return_vzeroupper):
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP
--
2.17.1

View File

@ -0,0 +1,263 @@
From ce7c6c491ed0750a10f9a52b5edc710d978e70e2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 15 Jul 2024 16:19:17 +0800
Subject: [PATCH 07/10] x86: Disable non-temporal memset on Skylake Server
The original commit enabling non-temporal memset on Skylake Server had
erroneous benchmarks (actually done on ICX).
Further benchmarks indicate non-temporal stores may in fact by a
regression on Skylake Server.
This commit may be over-cautious in some cases, but should avoid any
regressions for 2.40.
Tested using qemu on all x86_64 cpu arch supported by both qemu +
GLIBC.
Reviewed-by: DJ Delorie <dj@redhat.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 13 +-
sysdeps/x86/cpu-tunables.c | 6 +
sysdeps/x86/dl-cacheinfo.h | 15 +-
...cpu-features-preferred_feature_index_1.def | 1 +
sysdeps/x86/tst-hwcap-tunables.c | 148 ++++++++++++++++++
5 files changed, 173 insertions(+), 10 deletions(-)
create mode 100644 sysdeps/x86/tst-hwcap-tunables.c
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c4dd85145e..b4030776a7 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -754,11 +754,18 @@ init_cpu_features (struct cpu_features *cpu_features)
/* Newer Bigcore microarch (larger non-temporal store
threshold). */
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_COMETLAKE:
case INTEL_BIGCORE_SKYLAKE_AVX512:
case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due its it unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
case INTEL_BIGCORE_ICELAKE:
case INTEL_BIGCORE_TIGERLAKE:
case INTEL_BIGCORE_ROCKETLAKE:
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 0d4f328585..b8475730ea 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -272,6 +272,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
disable, 24);
}
break;
+ case 25:
+ {
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ Avoid_Non_Temporal_Memset,
+ disable, 25);
+ }
case 26:
{
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a76df092e6..de4584116f 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1051,13 +1051,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* Non-temporal stores are more performant on Intel and AMD hardware above
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd)
- memset_non_temporal_threshold = non_temporal_threshold;
-
- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
- cases slower than the vectorized path (and for some alignments,
- it is really slow, check BZ #30994). */
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+ && (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
if (cpu_features->basic.kind == arch_kind_amd)
rep_movsb_threshold = non_temporal_threshold;
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index d20c5b3196..aae1c85551 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Avoid_Non_Temporal_Memset)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
new file mode 100644
index 0000000000..94307283d7
--- /dev/null
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -0,0 +1,148 @@
+/* Tests for x86 GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
+ Copyright (C) 2023-2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <getopt.h>
+#include <ifunc-impl-list.h>
+#include <spawn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <intprops.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xunistd.h>
+#include <support/capture_subprocess.h>
+
+/* Nonzero if the program gets called via `exec'. */
+#define CMDLINE_OPTIONS \
+ { "restart", no_argument, &restart, 1 },
+static int restart;
+
+/* Disable everything. */
+static const char *test_1[] =
+{
+ "__memcpy_avx512_no_vzeroupper",
+ "__memcpy_avx512_unaligned",
+ "__memcpy_avx512_unaligned_erms",
+ "__memcpy_evex_unaligned",
+ "__memcpy_evex_unaligned_erms",
+ "__memcpy_avx_unaligned",
+ "__memcpy_avx_unaligned_erms",
+ "__memcpy_avx_unaligned_rtm",
+ "__memcpy_avx_unaligned_erms_rtm",
+ "__memcpy_ssse3",
+};
+
+static const struct test_t
+{
+ const char *env;
+ const char *const *funcs;
+ size_t nfuncs;
+} tests[] =
+{
+ {
+ /* Disable everything. */
+ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
+ test_1,
+ array_length (test_1)
+ },
+ {
+ /* Same as before, but with some empty suboptions. */
+ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
+ test_1,
+ array_length (test_1)
+ }
+};
+
+/* Called on process re-execution. */
+_Noreturn static void
+handle_restart (int ntest)
+{
+ struct libc_ifunc_impl impls[32];
+ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
+ if (cnt == 0)
+ _exit (EXIT_SUCCESS);
+ TEST_VERIFY_EXIT (cnt >= 1);
+ for (int i = 0; i < cnt; i++)
+ {
+ for (int f = 0; f < tests[ntest].nfuncs; f++)
+ {
+ if (strcmp (impls[i].name, tests[ntest].funcs[f]) == 0)
+ TEST_COMPARE (impls[i].usable, false);
+ }
+ }
+
+ _exit (EXIT_SUCCESS);
+}
+
+static int
+do_test (int argc, char *argv[])
+{
+ /* We must have either:
+ - One our fource parameters left if called initially:
+ + path to ld.so optional
+ + "--library-path" optional
+ + the library path optional
+ + the application name
+ + the test to check */
+
+ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
+
+ if (restart)
+ handle_restart (atoi (argv[1]));
+
+ char nteststr[INT_BUFSIZE_BOUND (int)];
+
+ char *spargv[10];
+ {
+ int i = 0;
+ for (; i < argc - 1; i++)
+ spargv[i] = argv[i + 1];
+ spargv[i++] = (char *) "--direct";
+ spargv[i++] = (char *) "--restart";
+ spargv[i++] = nteststr;
+ spargv[i] = NULL;
+ }
+
+ for (int i = 0; i < array_length (tests); i++)
+ {
+ snprintf (nteststr, sizeof nteststr, "%d", i);
+
+ printf ("[%d] Spawned test for %s\n", i, tests[i].env);
+ char *tunable = xasprintf ("glibc.cpu.hwcaps=%s", tests[i].env);
+ setenv ("GLIBC_TUNABLES", tunable, 1);
+
+ struct support_capture_subprocess result
+ = support_capture_subprogram (spargv[0], spargv, NULL);
+ support_capture_subprocess_check (&result, "tst-tunables", 0,
+ sc_allow_stderr);
+ support_capture_subprocess_free (&result);
+
+ free (tunable);
+ }
+
+ return 0;
+}
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
--
2.17.1

View File

@ -0,0 +1,92 @@
From 1e57e1c6aa6ca5a476aba725271c1ace9be345d3 Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:55 +0800
Subject: [PATCH 10/10] x86: Enable non-temporal memset for Hygon processors
This patch uses 'Avoid_Non_Temporal_Memset' flag to access
the non-temporal memset implementation for hygon processors.
Test Results:
hygon1 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 0.994
4MB 0.996
8MB 0.670
16MB 0.343
32MB 0.355
hygon2 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 1
4MB 1
8MB 1.312
16MB 0.822
32MB 0.830
hygon3 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 1
4MB 0.990
8MB 0.737
16MB 0.390
32MB 0.401
For hygon arch with this patch, non-temporal stores can improve
performance by 20% - 65%.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 9 +++++++--
sysdeps/x86/dl-cacheinfo.h | 2 +-
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 287edc5b08..f5539aea6f 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -640,9 +640,9 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
- /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
+ /* Default is avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is,
as of writing this, we only have benchmarks indicatings it profitability
- on Intel/AMD. */
+ on Intel/AMD/Hygon. */
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|= bit_arch_Avoid_Non_Temporal_Memset;
@@ -998,6 +998,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
get_extended_indices (cpu_features);
update_active (cpu_features);
+
+ /* Benchmarks indicate non-temporal memset can be profitable on Hygon
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
}
else
{
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 5803bfcea8..d4dad8df3b 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* Non-temporal stores are more performant on some hardware above
non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
- Intel and AMD hardware. */
+ Intel, AMD and Hygon hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
memset_non_temporal_threshold = non_temporal_threshold;
--
2.17.1

View File

@ -0,0 +1,47 @@
From 54e99a96ec3b97f53ee018bfa8dbbef2dd13f1e8 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 7 Jun 2024 23:04:47 +0000
Subject: [PATCH 05/10] x86: Enable non-temporal memset tunable for AMD
In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for
memset") a tunable threshold for enabling non-temporal memset was added,
but only for Intel hardware.
Since that commit, new benchmark results suggest that non-temporal
memset is beneficial on AMD, as well, so allow this tunable to be set
for AMD.
See:
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
which has been updated to include data using different stategies for
large memset on AMD Zen2, Zen3, and Zen4.
Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 9f27da21ce..dfdb4069c7 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,11 +1048,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores in memset have only been tested on Intel hardware.
- Until we benchmark data on other x86 processor, disable non-temporal
- stores in memset. */
+ /* Non-temporal stores are more performant on Intel and AMD hardware above
+ non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel)
+ if (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd)
memset_non_temporal_threshold = non_temporal_threshold;
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
--
2.17.1

View File

@ -0,0 +1,149 @@
From f1ea6401d790764e4fcf02c6fb28e69841c25640 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:38 -0300
Subject: [PATCH 03/10] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
The REP MOVSB usage on memcpy/memmove does not show much performance
improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
as from BZ 30994, if the source is aligned and the destination is not
the performance can be 20x slower.
The performance difference is noticeable with small buffer sizes, closer
to the lower bounds limits when memcpy/memmove starts to use ERMS. The
performance of REP MOVSB is similar to vectorized instruction on the
size limit (the L2 cache). Also, there is no drawback to multiple cores
sharing the cache.
Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
1 file changed, 18 insertions(+), 20 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 85c404dd26..ce2e6927e4 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -833,7 +833,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
long int data = -1;
long int shared = -1;
long int shared_per_thread = -1;
- long int core = -1;
unsigned int threads = 0;
unsigned long int level1_icache_size = -1;
unsigned long int level1_icache_linesize = -1;
@@ -851,7 +850,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (cpu_features->basic.kind == arch_kind_intel)
{
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
shared_per_thread = shared;
@@ -864,7 +862,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
level1_dcache_linesize
= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
- level2_cache_size = core;
+ level2_cache_size
+ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
level2_cache_assoc
= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
level2_cache_linesize
@@ -877,12 +876,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level4_cache_size
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
{
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
shared_per_thread = shared;
@@ -891,19 +890,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_amd)
{
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -911,7 +910,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);;
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
@@ -922,12 +921,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (shared <= 0)
{
/* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ shared = level2_cache_size;
}
else if (cpu_features->basic.family < 0x17)
{
/* Account for exclusive L2 and L3 caches. */
- shared += core;
+ shared += level2_cache_size;
}
shared_per_thread = shared;
@@ -1049,6 +1048,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_threshold = non_temporal_threshold;
+
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;
@@ -1090,16 +1095,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
SIZE_MAX);
unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
/* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
+ non-temporal threshold for all architectures. */
+ rep_movsb_stop_threshold = non_temporal_threshold;
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
--
2.17.1

View File

@ -0,0 +1,41 @@
From 8a2cea0ae0cbd4120770b81f0be422f60f020e17 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 14 Jun 2024 13:01:58 -0500
Subject: [PATCH 06/10] x86: Fix value for `x86_memset_non_temporal_threshold`
when it is undesirable
When we don't want to use non-temporal stores for memset, we set
`x86_memset_non_temporal_threshold` to SIZE_MAX.
The current code, however, we using `maximum_non_temporal_threshold`
as the upper bound which is `SIZE_MAX >> 4` so we ended up with a
value of `0`.
Fix is to just use `SIZE_MAX` as the upper bound for when setting the
tunable.
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index dfdb4069c7..a76df092e6 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1101,9 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
- TUNABLE_SET_WITH_BOUNDS (
- x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
- minimum_non_temporal_threshold, maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+ memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
--
2.17.1

View File

@ -67,7 +67,7 @@
############################################################################## ##############################################################################
Name: glibc Name: glibc
Version: 2.38 Version: 2.38
Release: 55 Release: 56
Summary: The GNU libc libraries Summary: The GNU libc libraries
License: %{all_license} License: %{all_license}
URL: http://www.gnu.org/software/glibc/ URL: http://www.gnu.org/software/glibc/
@ -268,6 +268,16 @@ Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
Patch181: backport-elf-Keep-using-minimal-malloc-after-early-DTV-resize.patch Patch181: backport-elf-Keep-using-minimal-malloc-after-early-DTV-resize.patch
Patch182: backport-x86-Add-new-architecture-type-for-Hygon-processors.patch
Patch183: backport-x86-Add-cache-information-support-for-Hygon-processo.patch
Patch184: backport-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
Patch185: backport-x86-Add-seperate-non-temporal-tunable-for-memset.patch
Patch186: backport-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
Patch187: backport-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
Patch188: backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
Patch189: backport-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
Patch190: backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
Patch191: backport-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
#openEuler patch list #openEuler patch list
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
@ -1492,6 +1502,18 @@ fi
%endif %endif
%changelog %changelog
* Wed Mar 12 2025 xiajimei <xiejiamei@hygon.cn> - 2.38-56
- x86: Enable non-temporal memset for Hygon processors
- x86: Add `Avoid_STOSB` tunable to allow NT memset without ERMS
- x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
- x86: Disable non-temporal memset on Skylake Server
- x86: Fix value for `x86_memset_non_temporal_threshold` when it is undesirable
- x86: Enable non-temporal memset tunable for AMD
- x86: Add seperate non-temporal tunable for memset
- x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
- x86: Add cache information support for Hygon processors
- x86: Add new architecture type for Hygon processors
* Sat Mar 08 2025 shixuantong <shixuantong1@huawei.com> - 2.38-55 * Sat Mar 08 2025 shixuantong <shixuantong1@huawei.com> - 2.38-55
- elf: Keep using minimal malloc after early DTV resize - elf: Keep using minimal malloc after early DTV resize