Signed-off-by: Xie jiamei <xiejiamei@hygon.cn> (cherry picked from commit 9cf451dd6fdd13ec64780b1f56c84778f99449fb)
96 lines
3.9 KiB
Diff
96 lines
3.9 KiB
Diff
From 01b5cac929a3be361dd575bed6673c40a25a6d61 Mon Sep 17 00:00:00 2001
|
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Wed, 14 Aug 2024 14:37:30 +0800
|
|
Subject: [PATCH 08/10] x86: Use `Avoid_Non_Temporal_Memset` to control
|
|
non-temporal path
|
|
|
|
This is just a refactor and there should be no behavioral change from
|
|
this commit.
|
|
|
|
The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
|
|
for controlling whether we use non-temporal memset rather than having
|
|
extra logic based on vendor.
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
---
|
|
sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
|
|
sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
|
|
2 files changed, 23 insertions(+), 8 deletions(-)
|
|
|
|
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
|
index b4030776a7..c9f2297524 100644
|
|
--- a/sysdeps/x86/cpu-features.c
|
|
+++ b/sysdeps/x86/cpu-features.c
|
|
@@ -640,6 +640,12 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|
unsigned int stepping = 0;
|
|
enum cpu_features_kind kind;
|
|
|
|
+ /* Default is to avoid non-temporal memset for non Intel/AMD hardware. This is because,
|
|
+ as of writing this, we only have benchmarks indicating its profitability
|
|
+ on Intel/AMD. */
|
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
|
+ |= bit_arch_Avoid_Non_Temporal_Memset;
|
|
+
|
|
cpu_features->cachesize_non_temporal_divisor = 4;
|
|
#if !HAS_CPUID
|
|
if (__get_cpuid_max (0, 0) == 0)
|
|
@@ -665,6 +671,11 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|
|
|
update_active (cpu_features);
|
|
|
|
+ /* Benchmarks indicate non-temporal memset can be profitable on Intel
|
|
+ hardware. */
|
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
|
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
|
|
+
|
|
if (family == 0x06)
|
|
{
|
|
model += extended_model;
|
|
@@ -874,6 +885,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
|
|
|
|
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
|
|
|
|
+ /* Benchmarks indicate non-temporal memset can be profitable on AMD
|
|
+ hardware. */
|
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
|
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
|
|
+
|
|
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
|
|
{
|
|
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
|
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
|
index de4584116f..d8288f0b0c 100644
|
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
|
@@ -1048,14 +1048,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
|
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
|
rep_movsb_threshold = 2112;
|
|
|
|
- /* Non-temporal stores are more performant on Intel and AMD hardware above
|
|
- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
|
- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
|
- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
|
|
- && (cpu_features->basic.kind == arch_kind_intel
|
|
- || cpu_features->basic.kind == arch_kind_amd))
|
|
- memset_non_temporal_threshold = non_temporal_threshold;
|
|
-
|
|
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
|
cases slower than the vectorized path (and for some alignments,
|
|
it is really slow, check BZ #30994). */
|
|
@@ -1077,6 +1069,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
|
if (tunable_size != 0)
|
|
shared = tunable_size;
|
|
|
|
+ /* Non-temporal stores are more performant on some hardware above
|
|
+ non_temporal_threshold. Currently Avoid_Non_Temporal_Memset is cleared for both
|
|
+ Intel and AMD hardware. */
|
|
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
|
|
+ memset_non_temporal_threshold = non_temporal_threshold;
|
|
+
|
|
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
|
|
if (tunable_size > minimum_non_temporal_threshold
|
|
&& tunable_size <= maximum_non_temporal_threshold)
|
|
--
|
|
2.17.1
|
|
|