!979 [sync] PR-978: x86: Set preferred CPU features and default NT threshold for Zhaoxin processors

From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
This commit is contained in:
openeuler-ci-bot 2025-01-20 06:17:40 +00:00 committed by Gitee
commit 8b46cd9a47
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
4 changed files with 237 additions and 1 deletions

View File

@ -0,0 +1,102 @@
From 5f5b877974cecf892346ae534edc4db9e8fbc75b Mon Sep 17 00:00:00 2001
From: May <mayshao-oc@zhaoxin.com>
Date: Wed, 15 Jan 2025 10:25:48 +0800
Subject: [PATCH 1/3] x86: Set preferred CPU features on the KH-40000 and
KX-7000 Zhaoxin processors
Fix code formatting under the Zhaoxin branch and add comments for
different Zhaoxin models.
Unaligned AVX loads are slower on KH-40000 and KX-7000, so disable
the AVX_Fast_Unaligned_Load.
Enable Prefer_No_VZEROUPPER and Fast_Unaligned_Load features to
use the sse2_unaligned versions of memset, strcpy and strcat.
Signed-off-by: May <mayshao-oc@zhaoxin.com>
---
sysdeps/x86/cpu-features.c | 51 ++++++++++++++++++++++++++------------
1 file changed, 35 insertions(+), 16 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index badf0888..43b5f562 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -907,39 +907,58 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
model += extended_model;
if (family == 0x6)
- {
- if (model == 0xf || model == 0x19)
- {
+ {
+ /* Tuning for older Zhaoxin processors. */
+ if (model == 0xf || model == 0x19)
+ {
CPU_FEATURE_UNSET (cpu_features, AVX);
CPU_FEATURE_UNSET (cpu_features, AVX2);
- cpu_features->preferred[index_arch_Slow_SSE4_2]
- |= bit_arch_Slow_SSE4_2;
+ cpu_features->preferred[index_arch_Slow_SSE4_2]
+ |= bit_arch_Slow_SSE4_2;
+ /* Unaligned AVX loads are slower. */
cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
- &= ~bit_arch_AVX_Fast_Unaligned_Load;
- }
- }
+ &= ~bit_arch_AVX_Fast_Unaligned_Load;
+ }
+ }
else if (family == 0x7)
- {
- if (model == 0x1b)
+ {
+ switch (model)
{
+ /* Wudaokou microarch tuning. */
+ case 0x1b:
CPU_FEATURE_UNSET (cpu_features, AVX);
CPU_FEATURE_UNSET (cpu_features, AVX2);
cpu_features->preferred[index_arch_Slow_SSE4_2]
- |= bit_arch_Slow_SSE4_2;
+ |= bit_arch_Slow_SSE4_2;
cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
- &= ~bit_arch_AVX_Fast_Unaligned_Load;
- }
- else if (model == 0x3b)
- {
+ &= ~bit_arch_AVX_Fast_Unaligned_Load;
+ break;
+
+ /* Lujiazui microarch tuning. */
+ case 0x3b:
CPU_FEATURE_UNSET (cpu_features, AVX);
CPU_FEATURE_UNSET (cpu_features, AVX2);
cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
- &= ~bit_arch_AVX_Fast_Unaligned_Load;
+ &= ~bit_arch_AVX_Fast_Unaligned_Load;
+ break;
+
+ /* Yongfeng and Shijidadao mircoarch tuning. */
+ case 0x5b:
+ case 0x6b:
+ cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
+ &= ~bit_arch_AVX_Fast_Unaligned_Load;
+
+ /* To use sse2_unaligned versions of memset, strcpy and strcat.
+ */
+ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ |= (bit_arch_Prefer_No_VZEROUPPER
+ | bit_arch_Fast_Unaligned_Load);
+ break;
}
}
}
--
2.27.0

View File

@ -0,0 +1,77 @@
From 59b19d50bd70c08e5c9f5db1742600b7b76df94a Mon Sep 17 00:00:00 2001
From: May <mayshao-oc@zhaoxin.com>
Date: Wed, 15 Jan 2025 10:32:17 +0800
Subject: [PATCH 2/3] x86_64: Optimize large size copy in memmove-ssse3
This patch optimizes large size copy using normal store when src > dst
and overlap. Make it the same as the logic in memmove-vec-unaligned-erms.S.
Currently, memmove-ssse3 uses '__x86_shared_cache_size_half' as the non-
temporal threshold; this patch updates that value to
'__x86_shared_non_temporal_threshold'. Currently, the
__x86_shared_non_temporal_threshold is cpu-specific, and different CPUs
will have different values based on the related nt-benchmark results.
However, in memmove-ssse3, the nontemporal threshold uses
'__x86_shared_cache_size_half', which sounds unreasonable.
The performance is not changed drastically although shows overall
improvements without any major regressions or gains.
Results on Zhaoxin KX-7000:
bench-memcpy geometric_mean(N=20) New / Original: 0.999
bench-memcpy-random geometric_mean(N=20) New / Original: 0.999
bench-memcpy-large geometric_mean(N=20) New / Original: 0.978
bench-memmove geometric_mean(N=20) New / Original: 1.000
bench-memmove-large geometric_mean(N=20) New / Original: 0.962
Results on Intel Core i5-6600K:
bench-memcpy geometric_mean(N=20) New / Original: 1.001
bench-memcpy-random geometric_mean(N=20) New / Original: 0.999
bench-memcpy-large geometric_mean(N=20) New / Original: 1.001
bench-memmove geometric_mean(N=20) New / Original: 0.995
bench-memmove-large geometric_mean(N=20) New / Original: 0.936
Signed-off-by: May <mayshao-oc@zhaoxin.com>
---
sysdeps/x86_64/multiarch/memmove-ssse3.S | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 460b0ec0..69561628 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -151,13 +151,10 @@ L(more_2x_vec):
loop. */
movups %xmm0, (%rdi)
-# ifdef SHARED_CACHE_SIZE_HALF
- cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
-# else
- cmp __x86_shared_cache_size_half(%rip), %rdx
-# endif
+ cmp __x86_shared_non_temporal_threshold(%rip), %rdx
ja L(large_memcpy)
+L(loop_fwd):
leaq -64(%rdi, %rdx), %r8
andq $-16, %rdi
movl $48, %edx
@@ -199,6 +196,13 @@ L(large_memcpy):
movups -64(%r9, %rdx), %xmm10
movups -80(%r9, %rdx), %xmm11
+ /* Check if src and dst overlap. If they do use cacheable
+ writes to potentially gain positive interference between
+ the loads during the memmove. */
+ subq %rdi, %r9
+ cmpq %rdx, %r9
+ jb L(loop_fwd)
+
sall $5, %ecx
leal (%rcx, %rcx, 2), %r8d
leaq -96(%rdi, %rdx), %rcx
--
2.27.0

View File

@ -0,0 +1,50 @@
From 6374dceef2bf72456144184cb70bdd216a744d61 Mon Sep 17 00:00:00 2001
From: May <mayshao-oc@zhaoxin.com>
Date: Wed, 15 Jan 2025 10:42:33 +0800
Subject: [PATCH 3/3] x86: Set default non_temporal_threshold for Zhaoxin
processors
Currently, 'non_temporal_threshold' is set to 'non_temporal_threshold_lowbound'
on Zhaoxin processors without ERMS. The default
'non_temporal_threshold_lowbound' is too small for the KH-40000 and KX-7000
Zhaoxin processors, this patch updates the value to
'shared / cachesize_non_temporal_divisor'.
Signed-off-by: May <mayshao-oc@zhaoxin.com>
---
sysdeps/x86/cpu-features.c | 1 +
sysdeps/x86/dl-cacheinfo.h | 6 ++++--
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 43b5f562..f752ebd2 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -949,6 +949,7 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
/* Yongfeng and Shijidadao mircoarch tuning. */
case 0x5b:
+ cpu_features->cachesize_non_temporal_divisor = 2;
case 0x6b:
cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
&= ~bit_arch_AVX_Fast_Unaligned_Load;
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 6c774042..bd2a9122 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -940,8 +940,10 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
a higher risk of actually thrashing the cache as they don't have a HW LRU
hint. As well, their performance in highly parallel situations is
- noticeably worse. */
- if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ noticeably worse. Zhaoxin processors are an exception, the lowbound is not
+ suitable for them based on actual test data. */
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ && cpu_features->basic.kind != arch_kind_zhaoxin)
non_temporal_threshold = non_temporal_threshold_lowbound;
/* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
--
2.27.0

View File

@ -67,7 +67,7 @@
##############################################################################
Name: glibc
Version: 2.38
Release: 50
Release: 51
Summary: The GNU libc libraries
License: %{all_license}
URL: http://www.gnu.org/software/glibc/
@ -291,6 +291,10 @@ Patch9031: 0021-Sw64-Add-test_numdouble.h-and-test_numfloat.h.patch
Patch9032: 0022-Sw64-Fix-posix-tst-glob_lstat_compat-on-sw64.patch
Patch9033: 0023-Sw64-add-getopt-weak-alias.patch
Patch9034: 0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
Patch9035: 0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
Patch9036: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
BuildRequires: audit-libs-devel >= 1.1.3, sed >= 3.95, libcap-devel, gettext
@ -1467,6 +1471,9 @@ fi
%endif
%changelog
* Wed Jan 15 2025 MayShao <mayshao-oc@zhaoxin.com> - 2.38-51
- x86: Set preferred CPU features and default NT threshold for Zhaoxin processors
* Tue Jan 07 2025 Peng Fan <fanpeng@loongson.cn> - 2.38-50
- LoongArch: Force SHMLBA the same as kernel