Compare commits

...

10 Commits

Author SHA1 Message Date
openeuler-ci-bot
5538373d14
!1022 [sync] PR-1021: sync from glibc upstream 2.38 branch
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-03-28 12:24:20 +00:00
Qingqing Li
b453407cdf sync from glibc upstream 2.38 branch.
Below is the patch list:
- math: Improve layout of exp/exp10 data
- AArch64: Use prefer_sve_ifuncs for SVE memset
- AArch64: Add SVE memset
- math: Improve layout of expf data
- AArch64: Remove zva_128 from memset
- AArch64: Optimize memset
- AArch64: Improve generic strlen
- assert: Add test for CVE-2025-0395

(cherry picked from commit a6a6276229d415c277b108ed8e6ef4f2fe517bae)
2025-03-28 17:56:22 +08:00
openeuler-ci-bot
d9f212c1d8
!1014 [sync] PR-1012: x86: Add support for Hygon processors
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-03-13 06:59:26 +00:00
Xie jiamei
cb3e0b2e06 x86: Add support for Hygon processors
Signed-off-by: Xie jiamei <xiejiamei@hygon.cn>
(cherry picked from commit 9cf451dd6fdd13ec64780b1f56c84778f99449fb)
2025-03-13 14:58:58 +08:00
openeuler-ci-bot
a2a517c64a
!1010 [sync] PR-1006: elf: Keep using minimal malloc after early DTV resize
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-03-08 02:00:56 +00:00
shixuantong
e214ed3103 elf: Keep using minimal malloc after early DTV resize
(cherry picked from commit 9eae27b47fc8fcad542f939ee869f65e4405421c)
2025-03-08 09:59:14 +08:00
openeuler-ci-bot
2b4695acd8
!999 [sync] PR-995: glibc testcases use the newly built ld.so instead of the environment's default installed ld.so
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-02-18 13:43:00 +00:00
shixuantong
b606cd617b glibc testcases use the newly built ld.so instead of the environment's default installed ld.so
(cherry picked from commit 39be4915e4725de1e52ced4c6b8d0323703fc8c2)
2025-02-18 19:30:51 +08:00
openeuler-ci-bot
2f571cfc1d
!989 [sync] PR-986: fix CVE-2019-1010023
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-02-07 09:19:08 +00:00
shixuantong
9b958700fd fix CVE-2019-1010023
(cherry picked from commit 0e80112809f744dee46b79cb37b4e8b28f546962)
2025-02-07 14:05:33 +08:00
21 changed files with 2467 additions and 2 deletions


@@ -0,0 +1,200 @@
From 52c2b1556f773d9a75d030160e0e273a5ea84502 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Tue, 24 Dec 2024 18:01:59 +0000
Subject: [PATCH] AArch64: Add SVE memset
Add an SVE memset based on the generic memset, with a predicated store for sizes < 16.
Unaligned memsets of 128-1024 bytes are improved by ~20% on average by using aligned
stores for the last 64 bytes. Performance of the random memset benchmark improves
by ~2% on Neoverse V1.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
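For sizes below 16, the patch replaces a branchy tail with a single predicated SVE store. A minimal C sketch of the idea using ACLE intrinsics (assumes an SVE-enabled toolchain, e.g. -march=armv8.2-a+sve; the function name is illustrative, not glibc's):

```c
#include <arm_sve.h>
#include <stddef.h>

/* Write exactly `count` bytes (count < 16) with one predicated store,
   mirroring the whilelo/st1b pair in __memset_sve_zva64.  Valid because
   the minimum SVE vector length is 16 bytes. */
static void
memset_small_sve (unsigned char *dst, int c, size_t count)
{
  svbool_t p = svwhilelt_b8_u64 (0, count);   /* lanes 0..count-1 active */
  svst1_u8 (p, dst, svdup_n_u8 ((unsigned char) c));
}
```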
---
sysdeps/aarch64/multiarch/Makefile | 1 +
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 +-
sysdeps/aarch64/multiarch/memset.c | 4 +
sysdeps/aarch64/multiarch/memset_sve_zva64.S | 123 +++++++++++++++++++
4 files changed, 130 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/aarch64/multiarch/memset_sve_zva64.S
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e4720b7468..214b6137b0 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -14,6 +14,7 @@ sysdep_routines += \
memset_generic \
memset_kunpeng \
memset_mops \
+ memset_sve_zva64 \
memset_zva64 \
strlen_asimd \
strlen_generic \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 73038ac810..2fa6baa319 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -56,7 +56,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
#if HAVE_AARCH64_SVE_ASM
- IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
#endif
IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 6deb6865e5..89fde57f42 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
+extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
static inline __typeof (__redirect_memset) *
select_memset_ifunc (void)
@@ -47,6 +48,9 @@ select_memset_ifunc (void)
{
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
+
+ if (zva_size == 64)
+ return __memset_sve_zva64;
}
if (IS_KUNPENG920 (midr))
diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
new file mode 100644
index 0000000000..7fb40fdd9e
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
@@ -0,0 +1,123 @@
+/* Optimized memset for SVE.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ * ZVA size is 64.
+ */
+
+#if HAVE_AARCH64_SVE_ASM
+
+.arch armv8.2-a+sve
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define vlen x5
+#define off x3
+#define dstend2 x5
+
+ENTRY (__memset_sve_zva64)
+ dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_16)
+
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+
+ .p2align 4
+L(set_16):
+ whilelo p0.b, xzr, count
+ st1b z0.b, p0, [dstin]
+ ret
+
+ .p2align 4
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ cmp count, 256
+ b.lo L(no_zva)
+ tst valw, 255
+ b.ne L(no_zva)
+
+ str q0, [dstin]
+ str q0, [dst, 16]
+ bic dst, dstin, 31
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
+ bic x8, x8, 15
+ stp q0, q0, [x8, -48]
+ str q0, [x8, -16]
+ str q0, [dstend, -16]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
+ ret
+
+L(no_zva):
+ str q0, [dstin]
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 16]
+ stp q0, q0, [dst, 48]
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_sve_zva64)
+#endif
--
2.27.0


@@ -0,0 +1,92 @@
From 9ca74b8ad1968d935815bdc2f1f1c7e9f2e32f70 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 7 Aug 2024 14:43:47 +0100
Subject: [PATCH] AArch64: Improve generic strlen
Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
size computation to avoid increasing latency. On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
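The core of this strlen design is the nibble-per-byte syndrome: a 16-byte NUL compare narrowed with SHRN #4 packs into 64 bits, and a count-trailing-zeros divided by 4 yields the NUL's byte index. A rough, self-contained NEON sketch of that step (little-endian only; the helper name is illustrative, not glibc's):

```c
#include <arm_neon.h>
#include <stdint.h>

/* Return the index of the first NUL in these 16 bytes, or -1 if none.
   Mirrors the cmeq/shrn/fmov/rbit/clz sequence in the patched strlen. */
static inline int
nul_index_16 (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t cmp  = vceqq_u8 (data, vdupq_n_u8 (0));     /* 0xff per NUL byte */
  uint8x8_t  nib  = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t   synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  if (synd == 0)
    return -1;                          /* no NUL; caller keeps scanning */
  return __builtin_ctzll (synd) >> 2;   /* 4 syndrome bits per input byte */
}
```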
---
sysdeps/aarch64/strlen.S | 39 +++++++++++++++++++++++++++------------
1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 133ef93342..352fb40d3a 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
+L(next16):
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
.p2align 5
L(loop):
- ldr data, [src, 16]
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
+ add src, src, 16
L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
- add result, result, 16
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (STRLEN)
--
2.27.0


@@ -0,0 +1,287 @@
From 95aa21432ccbf77225abd485d98df36ba760ff80 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon, 9 Sep 2024 15:26:47 +0100
Subject: [PATCH] AArch64: Optimize memset
Improve small memsets by avoiding branches and using overlapping stores.
Use DC ZVA for sets over 128 bytes. Remove unnecessary code for ZVA sizes
other than 64 and 128. Performance of the random memset benchmark improves by 24%
on Neoverse N1.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8)
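The branch avoidance for mid-size sets works by deriving store offsets from the count, so four possibly overlapping 16-byte stores cover every length from 16 to 63. A hedged C model of the new L(set_16..63) path (the helper name and the memcpy-as-16-byte-store idiom are mine, not glibc's):

```c
#include <string.h>
#include <stddef.h>

/* Set count bytes, 16 <= count <= 63, with no branches on count.
   off is 16 exactly when count >= 32 (the off = 16 & (count >> 1)
   trick from the patch), so the four stores always cover [0, count)
   even though they may overlap. */
static void
set16to63 (unsigned char *dst, unsigned char c, size_t count)
{
  unsigned char v[16];
  memset (v, c, sizeof v);
  size_t off = 16 & (count >> 1);
  memcpy (dst, v, 16);
  memcpy (dst + off, v, 16);
  memcpy (dst + count - off - 16, v, 16);
  memcpy (dst + count - 16, v, 16);
}
```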
---
sysdeps/aarch64/memset.S | 195 +++++++++++++++++----------------------
1 file changed, 84 insertions(+), 111 deletions(-)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index bbfb7184c3..caafb019e2 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "memset-reg.h"
#ifndef MEMSET
# define MEMSET memset
@@ -25,130 +25,132 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
-ENTRY (MEMSET)
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define off x3
+#define dstend2 x5
+ENTRY (MEMSET)
PTR_ARG (0)
SIZE_ARG (2)
dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_small)
+
add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
- cmp count, 96
- b.hi L(set_long)
- cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+ .p2align 4
/* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
- ret
- nop
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
+ str q0, [dst, 16]
+ tst valw, 255
+ b.ne L(no_zva)
+#ifndef ZVA64_ONLY
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+#endif
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
ret
-L(try_zva):
-#ifndef ZVA64_ONLY
.p2align 3
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
- nop
-#endif
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
- .p2align 4
-L(zva_64):
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- bic dst, dst, 63
stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
add dst, dst, 64
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
#ifndef ZVA64_ONLY
- .p2align 3
+ .p2align 4
L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
+ cmp zva_val, 5 /* ZVA size is 128 bytes. */
+ b.ne L(no_zva)
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
bic dst, dst, 127
sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
+ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
+1: add dst, dst, 128
+ dc zva, dst
subs count, count, 128
b.hi 1b
stp q0, q0, [dstend, -128]
@@ -156,35 +158,6 @@ L(zva_128):
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop. */
- b L(tail64)
#endif
END (MEMSET)
--
2.27.0


@@ -0,0 +1,65 @@
From 5fe151d86a19bc3dc791fd2d92efeb6c6e11cf64 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon, 25 Nov 2024 18:43:08 +0000
Subject: [PATCH] AArch64: Remove zva_128 from memset
Remove ZVA 128 support from memset - the new memset no longer
guarantees count >= 256, which can result in underflow and a
crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA
size of 128 and its memcpy implementation was removed in commit
e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special
case too.
[1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html
Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com>
(cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d)
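The underflow is easy to reproduce in C: L(zva_128) biased the count by 128 + 128 = 256, but the rewritten L(set_long) only guarantees count > 128, so the biased value can wrap and send the DC ZVA loop far past the buffer. A hedged illustration (the count value is made up):

```c
#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t count = 200;            /* reachable in the new L(set_long) */
  uint64_t biased = count - 256;   /* wraps: the DC ZVA loop overruns  */
  printf ("biased count = %#" PRIx64 "\n", biased);  /* 0xffffffffffffffc8 */
  return 0;
}
```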
---
sysdeps/aarch64/memset.S | 25 +------------------------
1 file changed, 1 insertion(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index caafb019e2..71814d0b2f 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -104,7 +104,7 @@ L(set_long):
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
+ b.ne L(no_zva)
#endif
stp q0, q0, [dst, 32]
bic dst, dstin, 63
@@ -137,28 +137,5 @@ L(no_zva_loop):
stp q0, q0, [dstend, -32]
ret
-#ifndef ZVA64_ONLY
- .p2align 4
-L(zva_128):
- cmp zva_val, 5 /* ZVA size is 128 bytes. */
- b.ne L(no_zva)
-
- stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128 + 128 /* Adjust count and bias for loop. */
-1: add dst, dst, 128
- dc zva, dst
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-#endif
-
END (MEMSET)
libc_hidden_builtin_def (MEMSET)
--
2.27.0


@@ -0,0 +1,29 @@
From 097299ffa904b327fce83770fa6a522e4393ddb3 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu, 27 Feb 2025 16:28:52 +0000
Subject: [PATCH] AArch64: Use prefer_sve_ifuncs for SVE memset
Use prefer_sve_ifuncs for SVE memset just like memcpy.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4)
---
sysdeps/aarch64/multiarch/memset.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 89fde57f42..ce5d35a20e 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -49,7 +49,7 @@ select_memset_ifunc (void)
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
- if (zva_size == 64)
+ if (prefer_sve_ifuncs && zva_size == 64)
return __memset_sve_zva64;
}
--
2.27.0


@@ -0,0 +1,132 @@
From f984e2d7e8299726891a1a497a3c36cd5542a0bf Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri, 31 Jan 2025 12:16:30 -0500
Subject: [PATCH] assert: Add test for CVE-2025-0395
Use the __progname symbol to override the program name to induce the
failure that CVE-2025-0395 describes.
This is related to BZ #32582
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
---
assert/Makefile | 1 +
assert/tst-assert-sa-2025-0001.c | 92 ++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+)
create mode 100644 assert/tst-assert-sa-2025-0001.c
diff --git a/assert/Makefile b/assert/Makefile
index 67f4e6a570..b0fc9fc4d2 100644
--- a/assert/Makefile
+++ b/assert/Makefile
@@ -38,6 +38,7 @@ tests := \
test-assert-perr \
tst-assert-c++ \
tst-assert-g++ \
+ tst-assert-sa-2025-0001 \
# tests
ifeq ($(have-cxx-thread_local),yes)
diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
new file mode 100644
index 0000000000..102cb0078d
--- /dev/null
+++ b/assert/tst-assert-sa-2025-0001.c
@@ -0,0 +1,92 @@
+/* Test for CVE-2025-0395.
+ Copyright The GNU Toolchain Authors.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Test that a large enough __progname does not result in a buffer overflow
+ when printing an assertion failure. This was CVE-2025-0395. */
+#include <assert.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xstdio.h>
+#include <support/xunistd.h>
+
+extern const char *__progname;
+
+int
+do_test (int argc, char **argv)
+{
+
+ support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
+ ignore_stderr ();
+
+ /* XXX assumes that the assert is on a 2 digit line number. */
+ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
+
+ int ret = fprintf (stderr, prompt, __FILE__);
+ if (ret < 0)
+ FAIL_EXIT1 ("fprintf failed: %m\n");
+
+ size_t pagesize = getpagesize ();
+ size_t namesize = pagesize - 1 - ret;
+
+ /* Alter the progname so that the assert message fills the entire page. */
+ char progname[namesize];
+ memset (progname, 'A', namesize - 1);
+ progname[namesize - 1] = '\0';
+ __progname = progname;
+
+ FILE *f = xfopen ("/proc/self/maps", "r");
+ char *line = NULL;
+ size_t len = 0;
+ uintptr_t prev_to = 0;
+
+ /* Pad the beginning of every writable mapping with a PROT_NONE map. This
+ ensures that the mmap in the assert_fail path never ends up below a
+ writable map and will terminate immediately in case of a buffer
+ overflow. */
+ while (xgetline (&line, &len, f))
+ {
+ uintptr_t from, to;
+ char perm[4];
+
+ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ",
+ &from, &to,
+ &perm[0], &perm[1], &perm[2], &perm[3]);
+
+ bool writable = (memchr (perm, 'w', 4) != NULL);
+
+ if (prev_to != 0 && from - prev_to > pagesize && writable)
+ xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+
+ prev_to = to;
+ }
+
+ xfclose (f);
+
+ assert (argc < 1);
+ return 0;
+}
+
+#define EXPECTED_SIGNAL SIGABRT
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
--
2.27.0


@@ -0,0 +1,210 @@
From 17f7ca193d60fefd6cc5e48aacd1ce9f7dd29862 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 14 Aug 2024 14:37:31 +0800
Subject: [PATCH 09/10] x86: Add `Avoid_STOSB` tunable to allow NT memset
without ERMS
The goal of this flag is to allow targets which don't prefer/have ERMS
to still access the non-temporal memset implementation.
There are 4 cases for tuning memset:
1) `Avoid_STOSB && Avoid_Non_Temporal_Memset`
- Memset with temporal stores
2) `Avoid_STOSB && !Avoid_Non_Temporal_Memset`
- Memset with temporal/non-temporal stores. Non-temporal path
goes through `rep stosb` path. We accomplish this by setting
`x86_rep_stosb_threshold` to
`x86_memset_non_temporal_threshold`.
3) `!Avoid_STOSB && Avoid_Non_Temporal_Memset`
- Memset with temporal stores/`rep stosb`
4) `!Avoid_STOSB && !Avoid_Non_Temporal_Memset`
- Memset with temporal stores/`rep stosb`/non-temporal stores.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
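A hedged model of the four cases (the function and parameter names are illustrative, not glibc's): since the non-temporal path is only reachable through the stosb branch, a no-ERMS target gets at it by aliasing the two thresholds, which also guarantees `rep stosb` itself is never chosen.

```c
#include <stdbool.h>
#include <stddef.h>

/* Pick memset thresholds for the four Avoid_STOSB /
   Avoid_Non_Temporal_Memset combinations described above. */
static void
pick_memset_thresholds (bool avoid_stosb, bool avoid_nt_memset,
                        size_t nt_threshold, size_t erms_default,
                        size_t *rep_stosb_thresh, size_t *nt_thresh)
{
  *nt_thresh = avoid_nt_memset ? (size_t) -1 : nt_threshold;
  /* With Avoid_STOSB, the stosb branch fires exactly when the
     non-temporal path applies (or never, if that is disabled too). */
  *rep_stosb_thresh = avoid_stosb ? *nt_thresh : erms_default;
}
```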
---
sysdeps/x86/cpu-features.c | 4 +++
sysdeps/x86/cpu-tunables.c | 2 ++
sysdeps/x86/dl-cacheinfo.h | 34 ++++++++++++++++---
...cpu-features-preferred_feature_index_1.def | 1 +
sysdeps/x86/tst-hwcap-tunables.c | 6 ++--
sysdeps/x86_64/multiarch/ifunc-memset.h | 18 +++++++---
6 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c9f2297524..287edc5b08 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -1014,6 +1014,10 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
cpu_features->preferred[index_arch_I686] |= bit_arch_I686;
+ /* No ERMS, we want to avoid stosb for memset. */
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB;
+
#if !HAS_CPUID
no_cpuid:
#endif
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index b8475730ea..a4bbf13080 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -214,6 +214,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Prefer_FSRM,
disable, 11);
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, Avoid_STOSB,
+ disable, 11);
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH (n, cpu_features,
Slow_SSE4_2,
SSE4_2,
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index d8288f0b0c..5803bfcea8 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1096,18 +1096,42 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
long int, NULL);
+ /*
+ For memset, the non-temporal implementation is only accessed through the
+ stosb code. ie:
+ ```
+ if (size >= rep_stosb_thresh)
+ {
+ if (size >= non_temporal_thresh)
+ {
+ do_non_temporal ();
+ }
+ do_stosb ();
+ }
+ do_normal_vec_loop ();
+ ```
+ So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
+ to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
+ `rep stosb` will never be used.
+ */
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+ memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, SIZE_MAX);
+ /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the
+ final value of `x86_memset_non_temporal_threshold`. In some cases this can
+ be a matter of correctness. */
+ if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB))
+ rep_stosb_threshold
+ = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
+ SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
- TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
- memset_non_temporal_threshold,
- minimum_non_temporal_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
- TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
- SIZE_MAX);
unsigned long int rep_movsb_stop_threshold;
/* Setting the upper bound of ERMS to the computed value of
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index aae1c85551..38a0c9226c 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
BIT (Avoid_Non_Temporal_Memset)
+BIT (Avoid_STOSB)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index 94307283d7..1920f5057e 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -60,7 +60,8 @@ static const struct test_t
/* Disable everything. */
"-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
- "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
+ "-Avoid_STOSB",
test_1,
array_length (test_1)
},
@@ -68,7 +69,8 @@ static const struct test_t
/* Same as before, but with some empty suboptions. */
",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
- "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
+ "-Avoid_STOSB,-,",
test_1,
array_length (test_1)
}
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 5c5096ec5a..6b3b9a17a2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
+static inline int
+prefer_erms_nt_impl (const struct cpu_features *cpu_features)
+{
+ return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
+}
+
static inline void *
IFUNC_SELECTOR (void)
{
@@ -61,7 +68,7 @@ IFUNC_SELECTOR (void)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx512_unaligned_erms);
return OPTIMIZE (avx512_unaligned);
@@ -76,7 +83,7 @@ IFUNC_SELECTOR (void)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (evex_unaligned_erms);
return OPTIMIZE (evex_unaligned);
@@ -84,7 +91,7 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx2_unaligned_erms_rtm);
return OPTIMIZE (avx2_unaligned_rtm);
@@ -93,14 +100,15 @@ IFUNC_SELECTOR (void)
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
Prefer_No_VZEROUPPER, !))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx2_unaligned_erms);
return OPTIMIZE (avx2_unaligned);
}
}
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
return OPTIMIZE (sse2_unaligned_erms);
return OPTIMIZE (sse2_unaligned);
--
2.17.1


@@ -0,0 +1,95 @@
From 01b5cac929a3be361dd575bed6673c40a25a6d61 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 14 Aug 2024 14:37:30 +0800
Subject: [PATCH 08/10] x86: Use `Avoid_Non_Temporal_Memset` to control
non-temporal path
This is just a refactor and there should be no behavioral change from
this commit.
The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
for controlling whether we use non-temporal memset rather than having
extra logic based on vendor.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
2 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index b4030776a7..c9f2297524 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -640,6 +640,12 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
+ /* Default is to avoid non-temporal memset on non-Intel/AMD hardware. This is
+ because, as of this writing, we only have benchmarks indicating its
+ profitability on Intel/AMD. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+
cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
@@ -665,6 +671,11 @@ init_cpu_features (struct cpu_features *cpu_features)
update_active (cpu_features);
+ /* Benchmarks indicate non-temporal memset can be profitable on Intel
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (family == 0x06)
{
model += extended_model;
@@ -874,6 +885,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
+ /* Benchmarks indicate non-temporal memset can be profitable on AMD
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
{
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index de4584116f..d8288f0b0c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,14 +1048,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores are more performant on Intel and AMD hardware above
- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
- && (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd))
- memset_non_temporal_threshold = non_temporal_threshold;
-
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1077,6 +1069,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (tunable_size != 0)
shared = tunable_size;
+ /* Non-temporal stores are more performant on some hardware above
+ non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
+ Intel and AMD hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
if (tunable_size > minimum_non_temporal_threshold
&& tunable_size <= maximum_non_temporal_threshold)
--
2.17.1


@@ -0,0 +1,192 @@
From aa3d7bd5299b33bffc118aa618b59bfa66059bcb Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Thu, 13 Feb 2025 21:56:52 +0100
Subject: [PATCH] elf: Keep using minimal malloc after early DTV resize (bug
32412)
If an auditor loads many TLS-using modules during startup, it is
possible to trigger DTV resizing. Previously, the DTV was marked
as allocated by the main malloc afterwards, even if the minimal
malloc was still in use. With this change, _dl_resize_dtv marks
the resized DTV as allocated with the minimal malloc.
The new test reuses TLS-using modules from other auditing tests.
Reviewed-by: DJ Delorie <dj@redhat.com>
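A self-contained analogy for the fix (not glibc's code; names are made up): memory handed out by an early bootstrap allocator must never reach the real free(), so a resize performed during bootstrap has to re-record the replacement block as the protected "initial" one.

```c
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

static void *initial_block;   /* block the teardown path must not free */

/* Grow a block; malloc() stands in for whichever allocator is live.
   The fix mirrored here: while still bootstrapping, mark the new block
   as initial so release_block skips it later. */
static void *
resize_block (void *old, size_t old_size, size_t new_size, bool bootstrapping)
{
  void *newp = malloc (new_size);
  if (newp == NULL)
    abort ();
  memcpy (newp, old, old_size);
  if (bootstrapping)
    initial_block = newp;
  return newp;
}

static void
release_block (void *block)
{
  if (block != initial_block)   /* bootstrap memory is never freed */
    free (block);
}
```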
---
elf/Makefile | 5 +++
elf/dl-tls.c | 7 ++++
elf/tst-audit-tlsdesc-dlopen2.c | 46 +++++++++++++++++++++++++
elf/tst-auditmod-tlsdesc2.c | 59 +++++++++++++++++++++++++++++++++
4 files changed, 117 insertions(+)
create mode 100644 elf/tst-audit-tlsdesc-dlopen2.c
create mode 100644 elf/tst-auditmod-tlsdesc2.c
diff --git a/elf/Makefile b/elf/Makefile
index 5c833871d0..1ea0e7037e 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -379,6 +379,7 @@ tests += \
tst-align3 \
tst-audit-tlsdesc \
tst-audit-tlsdesc-dlopen \
+ tst-audit-tlsdesc-dlopen2 \
tst-audit1 \
tst-audit2 \
tst-audit8 \
@@ -863,6 +864,7 @@ modules-names += \
tst-auditmanymod8 \
tst-auditmanymod9 \
tst-auditmod-tlsdesc \
+ tst-auditmod-tlsdesc2 \
tst-auditmod1 \
tst-auditmod11 \
tst-auditmod12 \
@@ -3189,6 +3191,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
+$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \
+ $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules))
+tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so
$(objpfx)tst-dlmopen-twice.out: \
$(objpfx)tst-dlmopen-twice-mod1.so \
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 5178d9b66a..a083a82933 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -566,6 +566,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
if (newp == NULL)
oom ();
memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t));
+#ifdef SHARED
+ /* Auditors can trigger a DTV resize event while the full malloc
+ is not yet in use. Mark the new DTV allocation as the
+ initial allocation. */
+ if (!__rtld_malloc_is_complete ())
+ GL(dl_initial_dtv) = &newp[1];
+#endif
}
else
{
diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c
new file mode 100644
index 0000000000..7ba2c4129a
--- /dev/null
+++ b/elf/tst-audit-tlsdesc-dlopen2.c
@@ -0,0 +1,46 @@
+/* Loading TLS-using modules from auditors (bug 32412). Main program.
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/xdlfcn.h>
+#include <stdio.h>
+
+static int
+do_test (void)
+{
+ puts ("info: start of main program");
+
+ /* Load TLS-using modules, to trigger DTV resizing. The dynamic
+ linker will load them again (requiring their own TLS) because the
+ dlopen calls from the auditor were in the auditing namespace. */
+ for (int i = 1; i <= 19; ++i)
+ {
+ char dso[30];
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
+ char sym[30];
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
+
+ void *handle = xdlopen (dso, RTLD_LAZY);
+ int (*func) (void) = xdlsym (handle, sym);
+ /* Trigger TLS allocation. */
+ func ();
+ }
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c
new file mode 100644
index 0000000000..50275cd34d
--- /dev/null
+++ b/elf/tst-auditmod-tlsdesc2.c
@@ -0,0 +1,59 @@
+/* Loading TLS-using modules from auditors (bug 32412). Audit module.
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <link.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+
+unsigned int
+la_version (unsigned int version)
+{
+ /* Open some modules, to trigger DTV resizing before the switch to
+ the main malloc. */
+ for (int i = 1; i <= 19; ++i)
+ {
+ char dso[30];
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
+ char sym[30];
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
+
+ void *handle = dlopen (dso, RTLD_LAZY);
+ if (handle == NULL)
+ {
+ printf ("error: dlmopen from auditor: %s\n", dlerror ());
+ fflush (stdout);
+ _exit (1);
+ }
+ int (*func) (void) = dlsym (handle, sym);
+ if (func == NULL)
+ {
+ printf ("error: dlsym from auditor: %s\n", dlerror ());
+ fflush (stdout);
+ _exit (1);
+ }
+ /* Trigger TLS allocation. */
+ func ();
+ }
+
+ puts ("info: TLS-using modules loaded from auditor");
+ fflush (stdout);
+
+ return LAV_CURRENT;
+}
--
2.27.0


@@ -0,0 +1,97 @@
From daa15a5bffc436cf7b943b306c85c90ce8bb369e Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:54 +0800
Subject: [PATCH 02/10] x86: Add cache information support for Hygon processors
Add hygon branch in dl_init_cacheinfo function to initialize
cache size variables for hygon processors. In the meanwhile,
add handle_hygon() function to get cache information.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
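The values handle_hygon() computes flow into the cpu_features cache-geometry fields set at the end of this hunk, which also back sysconf. A quick, hedged way to sanity-check the result on a Hygon machine (compiles on any Linux glibc; prints -1 where a level is unknown):

```c
#include <stdio.h>
#include <unistd.h>

int
main (void)
{
  printf ("L1d: %ld bytes\n", sysconf (_SC_LEVEL1_DCACHE_SIZE));
  printf ("L2:  %ld bytes\n", sysconf (_SC_LEVEL2_CACHE_SIZE));
  printf ("L3:  %ld bytes\n", sysconf (_SC_LEVEL3_CACHE_SIZE));
  return 0;
}
```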
---
sysdeps/x86/dl-cacheinfo.h | 60 ++++++++++++++++++++++++++++++++++++++
1 file changed, 60 insertions(+)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 7b5ed210ca..85c404dd26 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -567,6 +567,48 @@ handle_zhaoxin (int name)
return 0;
}
+static long int __attribute__ ((noinline))
+handle_hygon (int name)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ unsigned int count = 0x1;
+
+ if (name >= _SC_LEVEL3_CACHE_SIZE)
+ count = 0x3;
+ else if (name >= _SC_LEVEL2_CACHE_SIZE)
+ count = 0x2;
+ else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+ count = 0x0;
+
+ /* Use __cpuid__ '0x8000_001D' to compute cache details. */
+ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+
+ switch (name)
+ {
+ case _SC_LEVEL1_ICACHE_ASSOC:
+ case _SC_LEVEL1_DCACHE_ASSOC:
+ case _SC_LEVEL2_CACHE_ASSOC:
+ case _SC_LEVEL3_CACHE_ASSOC:
+ return ((ebx >> 22) & 0x3ff) + 1;
+ case _SC_LEVEL1_ICACHE_LINESIZE:
+ case _SC_LEVEL1_DCACHE_LINESIZE:
+ case _SC_LEVEL2_CACHE_LINESIZE:
+ case _SC_LEVEL3_CACHE_LINESIZE:
+ return (ebx & 0xfff) + 1;
+ case _SC_LEVEL1_ICACHE_SIZE:
+ case _SC_LEVEL1_DCACHE_SIZE:
+ case _SC_LEVEL2_CACHE_SIZE:
+ case _SC_LEVEL3_CACHE_SIZE:
+ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
+ default:
+ __builtin_unreachable ();
+ }
+ return -1;
+}
+
static void
get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
long int core)
@@ -890,6 +932,24 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
shared_per_thread = shared;
}
+ else if (cpu_features->basic.kind == arch_kind_hygon)
+ {
+ data = handle_hygon (_SC_LEVEL1_DCACHE_SIZE);
+ shared = handle_hygon (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
+
+ level1_icache_size = handle_hygon (_SC_LEVEL1_ICACHE_SIZE);
+ level1_icache_linesize = handle_hygon (_SC_LEVEL1_ICACHE_LINESIZE);
+ level1_dcache_size = data;
+ level1_dcache_assoc = handle_hygon (_SC_LEVEL1_DCACHE_ASSOC);
+ level1_dcache_linesize = handle_hygon (_SC_LEVEL1_DCACHE_LINESIZE);
+ level2_cache_size = handle_hygon (_SC_LEVEL2_CACHE_SIZE);
+ level2_cache_assoc = handle_hygon (_SC_LEVEL2_CACHE_ASSOC);
+ level2_cache_linesize = handle_hygon (_SC_LEVEL2_CACHE_LINESIZE);
+ level3_cache_size = shared;
+ level3_cache_assoc = handle_hygon (_SC_LEVEL3_CACHE_ASSOC);
+ level3_cache_linesize = handle_hygon (_SC_LEVEL3_CACHE_LINESIZE);
+ }
cpu_features->level1_icache_size = level1_icache_size;
cpu_features->level1_icache_linesize = level1_icache_linesize;
--
2.17.1


@@ -0,0 +1,69 @@
From 3215d6157f5f94706aa5db6783838885a8a3c4f1 Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:53 +0800
Subject: [PATCH 01/10] x86: Add new architecture type for Hygon processors
Add a new architecture type, arch_kind_hygon, to split the Hygon branch
from AMD. This makes it easier for Hygon processors to use settings
suited to their own characteristics.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
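The magic numbers tested in the patch are just the CPUID leaf-0 vendor string packed into little-endian words (EBX, EDX, ECX order). A small demonstration, assuming a little-endian host:

```c
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int
main (void)
{
  /* The constants from the patch's "HygonGenuine" check. */
  uint32_t ebx = 0x6f677948, edx = 0x6e65476e, ecx = 0x656e6975;
  char vendor[13];
  memcpy (vendor, &ebx, 4);
  memcpy (vendor + 4, &edx, 4);
  memcpy (vendor + 8, &ecx, 4);
  vendor[12] = '\0';
  puts (vendor);   /* prints "HygonGenuine" */
  return 0;
}
```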
---
sysdeps/x86/cpu-features.c | 19 ++++++++++++++++---
sysdeps/x86/include/cpu-features.h | 1 +
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index f752ebd24d..c4dd85145e 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -851,9 +851,8 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
|= bit_arch_Avoid_Short_Distance_REP_MOVSB;
}
- /* This spells out "AuthenticAMD" or "HygonGenuine". */
- else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
- || (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
+ /* This spells out "AuthenticAMD". */
+ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
{
unsigned int extended_model;
@@ -963,6 +962,20 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
}
}
}
+ /* This spells out "HygonGenuine". */
+ else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
+ {
+ unsigned int extended_model;
+
+ kind = arch_kind_hygon;
+
+ get_common_indices (cpu_features, &family, &model, &extended_model,
+ &stepping);
+
+ get_extended_indices (cpu_features);
+
+ update_active (cpu_features);
+ }
else
{
kind = arch_kind_other;
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index eb30d342a6..594feeb2f4 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -856,6 +856,7 @@ enum cpu_features_kind
arch_kind_intel,
arch_kind_amd,
arch_kind_zhaoxin,
+ arch_kind_hygon,
arch_kind_other
};
--
2.17.1


@@ -0,0 +1,211 @@
From 4ad2c9d04b76d7c4a42d80a82c022cd60b43b8b2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 24 May 2024 12:38:51 -0500
Subject: [PATCH 04/10] x86: Add separate non-temporal tunable for memset
The tuning for non-temporal stores for memset vs memcpy is not always
the same. This includes both the exact value and whether non-temporal
stores are profitable at all for a given arch.
This patch adds `x86_memset_non_temporal_threshold`. Currently we
disable non-temporal stores for non-Intel vendors, as the only
benchmarks showing a benefit have been on Intel hardware.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
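A hedged sketch of the size dispatch the new tunable controls, following the updated comment in memset-vec-unaligned-erms.S (the enum and function are illustrative, not glibc's):

```c
#include <stddef.h>

enum memset_path { VEC_LOOP, REP_STOSB, NON_TEMPORAL };

/* Sizes in [rep_stosb_threshold, memset_non_temporal_threshold) use
   REP STOSB on ERMS machines; sizes at or above the new tunable use
   non-temporal stores; everything smaller stays on the vector loop. */
static enum memset_path
classify (size_t n, size_t rep_stosb_threshold,
          size_t memset_non_temporal_threshold)
{
  if (n >= memset_non_temporal_threshold)
    return NON_TEMPORAL;
  if (n >= rep_stosb_threshold)
    return REP_STOSB;
  return VEC_LOOP;
}
```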
---
manual/tunables.texi | 16 +++++++++++++++-
sysdeps/x86/cacheinfo.h | 8 +++++++-
sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
sysdeps/x86/dl-tunables.list | 3 +++
sysdeps/x86/include/cpu-features.h | 4 +++-
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
7 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 6493904bae..2a2877884c 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.cpu.x86_shstk:
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
@@ -486,7 +487,8 @@ thread stack originally backup by Huge Pages to default pages.
@cindex shared_cache_size tunables
@cindex tunables, shared_cache_size
@cindex non_temporal_threshold tunables
-@cindex tunables, non_temporal_threshold
+@cindex memset_non_temporal_threshold tunables
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
@deftp {Tunable namespace} glibc.cpu
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
@@ -562,6 +564,18 @@ like memmove and memcpy.
This tunable is specific to i386 and x86-64.
@end deftp
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
+the user to set threshold in bytes for non temporal store in
+memset. Non temporal stores give a hint to the hardware to move data
+directly to memory without displacing other data from the cache. This
+tunable is used by some platforms to determine when to use non
+temporal stores memset.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
+
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
set threshold in bytes to start using "rep movsb". The value must be
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index ec1bc142c4..fd2b2ae66b 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
-/* Threshold to use non temporal store. */
+/* Threshold to use non temporal store in memmove. */
long int __x86_shared_non_temporal_threshold attribute_hidden;
+/* Threshold to use non temporal store in memset. */
+long int __x86_memset_non_temporal_threshold attribute_hidden;
+
/* Threshold to use Enhanced REP MOVSB. */
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
@@ -77,6 +80,9 @@ init_cacheinfo (void)
__x86_shared_non_temporal_threshold
= cpu_features->non_temporal_threshold;
+ __x86_memset_non_temporal_threshold
+ = cpu_features->memset_non_temporal_threshold;
+
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index ce2e6927e4..9f27da21ce 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,6 +1048,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* Non-temporal stores in memset have only been tested on Intel hardware.
+ Until we benchmark data on other x86 processor, disable non-temporal
+ stores in memset. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (cpu_features->basic.kind == arch_kind_intel)
+ memset_non_temporal_threshold = non_temporal_threshold;
+
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1074,6 +1081,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
&& tunable_size <= maximum_non_temporal_threshold)
non_temporal_threshold = tunable_size;
+ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ if (tunable_size > minimum_non_temporal_threshold
+ && tunable_size <= maximum_non_temporal_threshold)
+ memset_non_temporal_threshold = tunable_size;
+
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
if (tunable_size > minimum_rep_movsb_threshold)
rep_movsb_threshold = tunable_size;
@@ -1089,6 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (
+ x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, maximum_non_temporal_threshold);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
@@ -1102,6 +1117,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
+ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 5aab63e532..05d54b5eba 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -83,6 +83,8 @@ _dl_diagnostics_cpu (void)
cpu_features->shared_cache_size);
print_cpu_features_value ("non_temporal_threshold",
cpu_features->non_temporal_threshold);
+ print_cpu_features_value ("memset_non_temporal_threshold",
+ cpu_features->memset_non_temporal_threshold);
print_cpu_features_value ("rep_movsb_threshold",
cpu_features->rep_movsb_threshold);
print_cpu_features_value ("rep_movsb_stop_threshold",
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index d1442d88ba..53852d6a07 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -30,6 +30,9 @@ glibc {
x86_non_temporal_threshold {
type: SIZE_T
}
+ x86_memset_non_temporal_threshold {
+ type: SIZE_T
+ }
x86_rep_movsb_threshold {
type: SIZE_T
# Since there is overhead to set up REP MOVSB operation, REP
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 594feeb2f4..e2d641dcd0 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -918,8 +918,10 @@ struct cpu_features
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
unsigned long int shared_cache_size;
- /* Threshold to use non temporal store. */
+ /* Threshold to use non temporal store in memmove. */
unsigned long int non_temporal_threshold;
+ /* Threshold to use non temporal store in memset. */
+ unsigned long int memset_non_temporal_threshold;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
/* Threshold to stop using "rep movsb". */
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index aba45e3da0..d95750b516 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -24,9 +24,9 @@
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done.
6. On machines ERMS feature, if size is range
- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
then REP STOSB will be used.
- 7. If size >= __x86_shared_non_temporal_threshold, use a
+ 7. If size >= __x86_memset_non_temporal_threshold, use a
non-temporal stores. */
#include <sysdep.h>
@@ -318,7 +318,7 @@ L(return_vzeroupper):
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP
--
2.17.1
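
The hunks above wire up the new x86_memset_non_temporal_threshold tunable. For
context, here is a minimal C sketch of the technique it gates -- threshold-
dispatched non-temporal stores -- assuming SSE2; memset_nt and nt_threshold are
illustrative names, not glibc internals:

#include <emmintrin.h>   /* SSE2: _mm_set1_epi8, _mm_stream_si128, _mm_sfence */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical threshold; glibc derives the real value from cache sizes
   and exposes it as glibc.cpu.x86_memset_non_temporal_threshold.  */
static size_t nt_threshold = 8 * 1024 * 1024;

void *
memset_nt (void *dst, int c, size_t n)
{
  /* Small or misaligned buffers take the normal (cached) path.  */
  if (n < nt_threshold || ((uintptr_t) dst & 15) != 0)
    return memset (dst, c, n);

  __m128i v = _mm_set1_epi8 ((char) c);
  char *p = dst;
  for (size_t i = 0; i < n / 16; i++, p += 16)
    _mm_stream_si128 ((__m128i *) p, v);   /* store, bypassing the cache */
  _mm_sfence ();                           /* order the NT stores */
  if (n % 16)
    memset (p, c, n % 16);                 /* cached tail */
  return dst;
}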


@ -0,0 +1,263 @@
From ce7c6c491ed0750a10f9a52b5edc710d978e70e2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 15 Jul 2024 16:19:17 +0800
Subject: [PATCH 07/10] x86: Disable non-temporal memset on Skylake Server
The original commit enabling non-temporal memset on Skylake Server had
erroneous benchmarks (actually done on ICX).
Further benchmarks indicate non-temporal stores may in fact be a
regression on Skylake Server.
This commit may be over-cautious in some cases, but should avoid any
regressions for 2.40.
Tested using qemu on all x86_64 CPU architectures supported by both
qemu and GLIBC.
Reviewed-by: DJ Delorie <dj@redhat.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 13 +-
sysdeps/x86/cpu-tunables.c | 6 +
sysdeps/x86/dl-cacheinfo.h | 15 +-
...cpu-features-preferred_feature_index_1.def | 1 +
sysdeps/x86/tst-hwcap-tunables.c | 148 ++++++++++++++++++
5 files changed, 173 insertions(+), 10 deletions(-)
create mode 100644 sysdeps/x86/tst-hwcap-tunables.c
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c4dd85145e..b4030776a7 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -754,11 +754,18 @@ init_cpu_features (struct cpu_features *cpu_features)
/* Newer Bigcore microarch (larger non-temporal store
threshold). */
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_COMETLAKE:
case INTEL_BIGCORE_SKYLAKE_AVX512:
case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due to its unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
case INTEL_BIGCORE_ICELAKE:
case INTEL_BIGCORE_TIGERLAKE:
case INTEL_BIGCORE_ROCKETLAKE:
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 0d4f328585..b8475730ea 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -272,6 +272,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
disable, 24);
}
break;
+ case 25:
+ {
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ Avoid_Non_Temporal_Memset,
+ disable, 25);
+ }
case 26:
{
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a76df092e6..de4584116f 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1051,13 +1051,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* Non-temporal stores are more performant on Intel and AMD hardware above
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd)
- memset_non_temporal_threshold = non_temporal_threshold;
-
- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
- cases slower than the vectorized path (and for some alignments,
- it is really slow, check BZ #30994). */
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+ && (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
if (cpu_features->basic.kind == arch_kind_amd)
rep_movsb_threshold = non_temporal_threshold;
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index d20c5b3196..aae1c85551 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Avoid_Non_Temporal_Memset)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
new file mode 100644
index 0000000000..94307283d7
--- /dev/null
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -0,0 +1,148 @@
+/* Tests for x86 GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
+ Copyright (C) 2023-2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <getopt.h>
+#include <ifunc-impl-list.h>
+#include <spawn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <intprops.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xunistd.h>
+#include <support/capture_subprocess.h>
+
+/* Nonzero if the program gets called via `exec'. */
+#define CMDLINE_OPTIONS \
+ { "restart", no_argument, &restart, 1 },
+static int restart;
+
+/* Disable everything. */
+static const char *test_1[] =
+{
+ "__memcpy_avx512_no_vzeroupper",
+ "__memcpy_avx512_unaligned",
+ "__memcpy_avx512_unaligned_erms",
+ "__memcpy_evex_unaligned",
+ "__memcpy_evex_unaligned_erms",
+ "__memcpy_avx_unaligned",
+ "__memcpy_avx_unaligned_erms",
+ "__memcpy_avx_unaligned_rtm",
+ "__memcpy_avx_unaligned_erms_rtm",
+ "__memcpy_ssse3",
+};
+
+static const struct test_t
+{
+ const char *env;
+ const char *const *funcs;
+ size_t nfuncs;
+} tests[] =
+{
+ {
+ /* Disable everything. */
+ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
+ test_1,
+ array_length (test_1)
+ },
+ {
+ /* Same as before, but with some empty suboptions. */
+ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
+ test_1,
+ array_length (test_1)
+ }
+};
+
+/* Called on process re-execution. */
+_Noreturn static void
+handle_restart (int ntest)
+{
+ struct libc_ifunc_impl impls[32];
+ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
+ if (cnt == 0)
+ _exit (EXIT_SUCCESS);
+ TEST_VERIFY_EXIT (cnt >= 1);
+ for (int i = 0; i < cnt; i++)
+ {
+ for (int f = 0; f < tests[ntest].nfuncs; f++)
+ {
+ if (strcmp (impls[i].name, tests[ntest].funcs[f]) == 0)
+ TEST_COMPARE (impls[i].usable, false);
+ }
+ }
+
+ _exit (EXIT_SUCCESS);
+}
+
+static int
+do_test (int argc, char *argv[])
+{
+ /* We must have either:
+ - One or four parameters left if called initially:
+ + path to ld.so optional
+ + "--library-path" optional
+ + the library path optional
+ + the application name
+ + the test to check */
+
+ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
+
+ if (restart)
+ handle_restart (atoi (argv[1]));
+
+ char nteststr[INT_BUFSIZE_BOUND (int)];
+
+ char *spargv[10];
+ {
+ int i = 0;
+ for (; i < argc - 1; i++)
+ spargv[i] = argv[i + 1];
+ spargv[i++] = (char *) "--direct";
+ spargv[i++] = (char *) "--restart";
+ spargv[i++] = nteststr;
+ spargv[i] = NULL;
+ }
+
+ for (int i = 0; i < array_length (tests); i++)
+ {
+ snprintf (nteststr, sizeof nteststr, "%d", i);
+
+ printf ("[%d] Spawned test for %s\n", i, tests[i].env);
+ char *tunable = xasprintf ("glibc.cpu.hwcaps=%s", tests[i].env);
+ setenv ("GLIBC_TUNABLES", tunable, 1);
+
+ struct support_capture_subprocess result
+ = support_capture_subprogram (spargv[0], spargv, NULL);
+ support_capture_subprocess_check (&result, "tst-tunables", 0,
+ sc_allow_stderr);
+ support_capture_subprocess_free (&result);
+
+ free (tunable);
+ }
+
+ return 0;
+}
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
--
2.17.1
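
A usage note on the patch above: Avoid_Non_Temporal_Memset is an ordinary
preferred-feature bit, so the Skylake Server default is overridable at run
time. As the included test demonstrates, listing -Avoid_Non_Temporal_Memset
in GLIBC_TUNABLES=glibc.cpu.hwcaps=... clears the bit and re-enables the
non-temporal memset path.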


@ -0,0 +1,92 @@
From 1e57e1c6aa6ca5a476aba725271c1ace9be345d3 Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:55 +0800
Subject: [PATCH 10/10] x86: Enable non-temporal memset for Hygon processors
This patch uses 'Avoid_Non_Temporal_Memset' flag to access
the non-temporal memset implementation for hygon processors.
Test Results:
hygon1 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 0.994
4MB 0.996
8MB 0.670
16MB 0.343
32MB 0.355
hygon2 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 1
4MB 1
8MB 1.312
16MB 0.822
32MB 0.830
hygon3 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 1
4MB 0.990
8MB 0.737
16MB 0.390
32MB 0.401
For hygon arch with this patch, non-temporal stores can improve
performance by 20% - 65%.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 9 +++++++--
sysdeps/x86/dl-cacheinfo.h | 2 +-
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 287edc5b08..f5539aea6f 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -640,9 +640,9 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
- /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
+ /* Default is avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is,
as of writing this, we only have benchmarks indicating its profitability
- on Intel/AMD. */
+ on Intel/AMD/Hygon. */
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|= bit_arch_Avoid_Non_Temporal_Memset;
@@ -998,6 +998,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
get_extended_indices (cpu_features);
update_active (cpu_features);
+
+ /* Benchmarks indicate non-temporal memset can be profitable on Hygon
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
}
else
{
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 5803bfcea8..d4dad8df3b 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* Non-temporal stores are more performant on some hardware above
non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
- Intel and AMD hardware. */
+ Intel, AMD and Hygon hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
memset_non_temporal_threshold = non_temporal_threshold;
--
2.17.1
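
The enable/disable mechanics in this patch are plain bit operations on the
preferred-feature array. A self-contained sketch of the idiom (the bit value
here is made up for illustration):

#include <stdio.h>

/* Illustrative stand-in for glibc's bit_arch_* constant.  */
#define bit_arch_Avoid_Non_Temporal_Memset (1u << 0)

int
main (void)
{
  unsigned int preferred = 0;

  /* Default: assume non-temporal memset is unprofitable.  */
  preferred |= bit_arch_Avoid_Non_Temporal_Memset;

  /* Hygon path: benchmarks justify NT stores, so clear the bit.  */
  preferred &= ~bit_arch_Avoid_Non_Temporal_Memset;

  printf ("avoid NT memset: %u\n",
          !!(preferred & bit_arch_Avoid_Non_Temporal_Memset));   /* prints 0 */
  return 0;
}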


@ -0,0 +1,47 @@
From 54e99a96ec3b97f53ee018bfa8dbbef2dd13f1e8 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 7 Jun 2024 23:04:47 +0000
Subject: [PATCH 05/10] x86: Enable non-temporal memset tunable for AMD
In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for
memset") a tunable threshold for enabling non-temporal memset was added,
but only for Intel hardware.
Since that commit, new benchmark results suggest that non-temporal
memset is beneficial on AMD, as well, so allow this tunable to be set
for AMD.
See:
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
which has been updated to include data using different strategies for
large memset on AMD Zen2, Zen3, and Zen4.
Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 9f27da21ce..dfdb4069c7 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,11 +1048,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores in memset have only been tested on Intel hardware.
- Until we benchmark data on other x86 processor, disable non-temporal
- stores in memset. */
+ /* Non-temporal stores are more performant on Intel and AMD hardware above
+ non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel)
+ if (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd)
memset_non_temporal_threshold = non_temporal_threshold;
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
--
2.17.1


@ -0,0 +1,149 @@
From f1ea6401d790764e4fcf02c6fb28e69841c25640 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:38 -0300
Subject: [PATCH 03/10] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
The REP MOVSB usage on memcpy/memmove does not show much performance
improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
as from BZ 30994, if the source is aligned and the destination is not
the performance can be 20x slower.
The performance difference is noticeable with small buffer sizes, closer
to the lower bounds limits when memcpy/memmove starts to use ERMS. The
performance of REP MOVSB is similar to vectorized instruction on the
size limit (the L2 cache). Also, there is no drawback to multiple cores
sharing the cache.
Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
1 file changed, 18 insertions(+), 20 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 85c404dd26..ce2e6927e4 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -833,7 +833,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
long int data = -1;
long int shared = -1;
long int shared_per_thread = -1;
- long int core = -1;
unsigned int threads = 0;
unsigned long int level1_icache_size = -1;
unsigned long int level1_icache_linesize = -1;
@@ -851,7 +850,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (cpu_features->basic.kind == arch_kind_intel)
{
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
shared_per_thread = shared;
@@ -864,7 +862,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
level1_dcache_linesize
= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
- level2_cache_size = core;
+ level2_cache_size
+ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
level2_cache_assoc
= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
level2_cache_linesize
@@ -877,12 +876,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level4_cache_size
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
{
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
shared_per_thread = shared;
@@ -891,19 +890,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_amd)
{
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -911,7 +910,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
@@ -922,12 +921,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (shared <= 0)
{
/* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ shared = level2_cache_size;
}
else if (cpu_features->basic.family < 0x17)
{
/* Account for exclusive L2 and L3 caches. */
- shared += core;
+ shared += level2_cache_size;
}
shared_per_thread = shared;
@@ -1049,6 +1048,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_threshold = non_temporal_threshold;
+
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;
@@ -1090,16 +1095,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
SIZE_MAX);
unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
/* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
+ non-temporal threshold for all architectures. */
+ rep_movsb_stop_threshold = non_temporal_threshold;
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
--
2.17.1
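
The selection logic the patch adjusts boils down to a size-window test. A
hypothetical C sketch (x86-64 with GCC/Clang inline asm; the thresholds and
function names are illustrative, not glibc's dispatch code):

#include <stddef.h>
#include <string.h>

/* Illustrative values; glibc computes both from cache geometry.  */
static size_t rep_movsb_threshold = 2112;
static size_t rep_movsb_stop_threshold = 1u << 20;

static void *
copy_rep_movsb (void *d, const void *s, size_t n)
{
  void *ret = d;
  __asm__ volatile ("rep movsb"        /* ERMS fast path */
                    : "+D" (d), "+S" (s), "+c" (n)
                    :: "memory");
  return ret;
}

void *
copy_dispatch (void *d, const void *s, size_t n)
{
  /* On Zen3+ the patch sets rep_movsb_threshold to the non-temporal
     threshold, which closes this window and keeps the vector path.  */
  if (n >= rep_movsb_threshold && n < rep_movsb_stop_threshold)
    return copy_rep_movsb (d, s, n);
  return memcpy (d, s, n);             /* stand-in for the SIMD loop */
}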


@ -0,0 +1,41 @@
From 8a2cea0ae0cbd4120770b81f0be422f60f020e17 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 14 Jun 2024 13:01:58 -0500
Subject: [PATCH 06/10] x86: Fix value for `x86_memset_non_temporal_threshold`
when it is undesirable
When we don't want to use non-temporal stores for memset, we set
`x86_memset_non_temporal_threshold` to SIZE_MAX.
The current code, however, was using `maximum_non_temporal_threshold`
as the upper bound, which is `SIZE_MAX >> 4`, so we ended up with a
value of `0`.
The fix is to just use `SIZE_MAX` as the upper bound when setting the
tunable.
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index dfdb4069c7..a76df092e6 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1101,9 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
- TUNABLE_SET_WITH_BOUNDS (
- x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
- minimum_non_temporal_threshold, maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+ memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
--
2.17.1
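
To see why the value collapsed to 0, here is a hypothetical sketch of a bounds
check that rejects, rather than clamps, an out-of-range default; it mirrors the
symptom described above, not the real TUNABLE_SET_WITH_BOUNDS implementation:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical: the value is stored only when inside [min, max],
   otherwise it is left at its zero-initialized state.  */
static size_t
set_with_bounds (size_t val, size_t min, size_t max)
{
  return (val >= min && val <= max) ? val : 0;
}

int
main (void)
{
  /* SIZE_MAX (meaning "never use NT memset") fails the SIZE_MAX >> 4
     upper bound, so the threshold becomes 0 and every memset would
     take the non-temporal path instead of none.  */
  size_t t = set_with_bounds (SIZE_MAX, 0x4040, SIZE_MAX >> 4);
  printf ("threshold = %zu\n", t);   /* prints 0 */
  return 0;
}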


@ -0,0 +1,66 @@
From fe1ffef2eec9c6634a1e9af951eb68f0f5614470 Mon Sep 17 00:00:00 2001
From: xujing <xujing99@huawei.com>
Date: Thu, 2 Dec 2021 11:41:46 +0800
Subject: [PATCH] glibc: fix CVE-2019-1010023
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
| PT_LOAD
|
| […] Loadable segment entries in the program header table appear in
| ascending order, sorted on the p_vaddr member.
http://www.sco.com/developers/gabi/latest/ch5.pheader.html
A check is needed to fix the vulnerability in load-command mapping reported at
https://sourceware.org/bugzilla/show_bug.cgi?id=22851
Signed-off-by: lvying <lvying6@huawei.com>
Signed-off-by: xujing <xujing99@huawei.com>
---
elf/dl-map-segments.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h
index 084076a2..a41ae73b 100644
--- a/elf/dl-map-segments.h
+++ b/elf/dl-map-segments.h
@@ -33,6 +33,7 @@ _dl_map_segments (struct link_map *l, int fd,
struct link_map *loader)
{
const struct loadcmd *c = loadcmds;
+ ElfW(Addr) l_map_end_aligned;
if (__glibc_likely (type == ET_DYN))
{
@@ -61,6 +62,8 @@ _dl_map_segments (struct link_map *l, int fd,
return DL_MAP_SEGMENTS_ERROR_MAP_SEGMENT;
l->l_map_end = l->l_map_start + maplength;
+ l_map_end_aligned = ((l->l_map_end + GLRO(dl_pagesize) - 1)
+ & ~(GLRO(dl_pagesize) - 1));
l->l_addr = l->l_map_start - c->mapstart;
if (has_holes)
@@ -85,10 +88,16 @@ _dl_map_segments (struct link_map *l, int fd,
/* Remember which part of the address space this object uses. */
l->l_map_start = c->mapstart + l->l_addr;
l->l_map_end = l->l_map_start + maplength;
+ l_map_end_aligned = ((l->l_map_end + GLRO(dl_pagesize) - 1)
+ & ~(GLRO(dl_pagesize) - 1));
l->l_contiguous = !has_holes;
while (c < &loadcmds[nloadcmds])
{
+ if ((l->l_addr + c->mapend) > l_map_end_aligned ||
+ (l->l_addr + c->mapstart) < l->l_map_start)
+ return DL_MAP_SEGMENTS_ERROR_MAP_SEGMENT;
+
if (c->mapend > c->mapstart
/* Map the segment contents from the file. */
&& (__mmap ((void *) (l->l_addr + c->mapstart),
--
2.23.0
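
The essence of the added check, extracted into a standalone hypothetical helper
(field names mirror the link map, but this is a sketch, not the loader code):

#include <stdint.h>

/* Round an address up to a page boundary, as the patch does for
   l_map_end.  */
static uintptr_t
page_align_up (uintptr_t addr, uintptr_t pagesize)
{
  return (addr + pagesize - 1) & ~(pagesize - 1);
}

/* Reject a PT_LOAD command whose mapping would fall outside the range
   reserved for the object (cf. BZ 22851, where unsorted or overlapping
   segments could be mapped elsewhere).  */
static int
segment_in_bounds (uintptr_t l_addr, uintptr_t mapstart, uintptr_t mapend,
                   uintptr_t l_map_start, uintptr_t l_map_end,
                   uintptr_t pagesize)
{
  uintptr_t l_map_end_aligned = page_align_up (l_map_end, pagesize);
  return (l_addr + mapend) <= l_map_end_aligned
         && (l_addr + mapstart) >= l_map_start;
}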


@ -67,7 +67,7 @@
##############################################################################
Name: glibc
Version: 2.38
Release: 52
Release: 57
Summary: The GNU libc libraries
License: %{all_license}
URL: http://www.gnu.org/software/glibc/
@ -267,6 +267,25 @@ Patch177: elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
Patch181: backport-elf-Keep-using-minimal-malloc-after-early-DTV-resize.patch
Patch182: backport-x86-Add-new-architecture-type-for-Hygon-processors.patch
Patch183: backport-x86-Add-cache-information-support-for-Hygon-processo.patch
Patch184: backport-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
Patch185: backport-x86-Add-seperate-non-temporal-tunable-for-memset.patch
Patch186: backport-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
Patch187: backport-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
Patch188: backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
Patch189: backport-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
Patch190: backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
Patch191: backport-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
Patch192: assert-Add-test-for-CVE-2025-0395.patch
Patch193: AArch64-Improve-generic-strlen.patch
Patch194: AArch64-Optimize-memset.patch
Patch195: AArch64-Remove-zva_128-from-memset.patch
Patch196: math-Improve-layout-of-expf-data.patch
Patch197: AArch64-Add-SVE-memset.patch
Patch198: AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch
Patch199: math-Improve-layout-of-exp-exp10-data.patch
#openEuler patch list
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
@ -310,6 +329,8 @@ Patch9034: 0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
Patch9035: 0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
Patch9036: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
Patch9037: fix-CVE-2019-1010023.patch
Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
BuildRequires: audit-libs-devel >= 1.1.3, sed >= 3.95, libcap-devel, gettext
@ -646,6 +667,7 @@ mkdir $builddir
pushd $builddir
../configure CC="%GCC" CXX="%GXX" CFLAGS="$BuildFlags" LDFLAGS="$LinkFlags" \
--prefix=%{_prefix} \
--enable-hardcoded-path-in-tests \
--with-headers=%{_prefix}/include $EnableKernel \
--with-nonshared-cflags=-Wp,-D_FORTIFY_SOURCE=2 \
--enable-bind-now \
@ -1085,7 +1107,9 @@ function removeLoadPath()
currPath=$(echo $runpathInfo | awk -F "RUNPATH=" '{print $2}')
fi
if [ x"$currPath" == x"\$ORIGIN" ]; then
# commit 2dcaf7064 uses rpath instead of runpath with --enable-hardcoded-path-in-tests
# so match "\$ORIGIN"* to cover rpath values
if [[ x"$currPath" == x"\$ORIGIN"* ]]; then
chrpath -d $file
findReliantLib $file
@ -1486,6 +1510,37 @@ fi
%endif
%changelog
* Fri Mar 28 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-57
- math: Improve layout of exp/exp10 data
- AArch64: Use prefer_sve_ifuncs for SVE memset
- AArch64: Add SVE memset
- math: Improve layout of expf data
- AArch64: Remove zva_128 from memset
- AArch64: Optimize memset
- AArch64: Improve generic strlen
- assert: Add test for CVE-2025-0395
* Wed Mar 12 2025 xiajimei <xiejiamei@hygon.cn> - 2.38-56
- x86: Enable non-temporal memset for Hygon processors
- x86: Add `Avoid_STOSB` tunable to allow NT memset without ERMS
- x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
- x86: Disable non-temporal memset on Skylake Server
- x86: Fix value for `x86_memset_non_temporal_threshold` when it is undesirable
- x86: Enable non-temporal memset tunable for AMD
- x86: Add seperate non-temporal tunable for memset
- x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
- x86: Add cache information support for Hygon processors
- x86: Add new architecture type for Hygon processors
* Sat Mar 08 2025 shixuantong <shixuantong1@huawei.com> - 2.38-55
- elf: Keep using minimal malloc after early DTV resize
* Tue Feb 18 2025 shixuantong <shixuantong1@huawei.com> - 2.38-54
- glibc testcase use newly built ld.so instead of environment default installed ld.so
* Thu Feb 06 2025 shixuantong <shixuantong1@huawei.com> - 2.38-53
- fix CVE-2019-1010023
* Sun Jan 26 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-52
- stdlib: Test using setenv with updated environ [BZ #32588]
- Fix underallocation of abort_msg_s struct (CVE-2025-0395)


@ -0,0 +1,39 @@
From 5a08d049dc5037e89eb95bb1506652f0043fa39e Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Fri, 13 Dec 2024 15:43:07 +0000
Subject: [PATCH] math: Improve layout of exp/exp10 data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp_data struct slightly so that the fields are better aligned
and without gaps. As a result on targets that support them, more load-pair
instructions are used in exp.
The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on
Neoverse V2.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7)
---
sysdeps/ieee754/dbl-64/math_config.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 19af33fd86..52b720ecd1 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -195,10 +195,11 @@ check_uflow (double x)
extern const struct exp_data
{
double invln2N;
- double shift;
double negln2hiN;
double negln2loN;
double poly[4]; /* Last four coefficients. */
+ double shift;
+
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
--
2.27.0
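
A self-contained illustration of the layout idea, with simplified stand-ins
rather than the real exp_data definition: moving the rarely-paired shift field
to the end leaves the constants that are used together in adjacent slots, the
kind of arrangement that lets targets with load-pair instructions (e.g.
AArch64 ldp) fetch two doubles at once.

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the old and new field orders.  */
struct old_layout { double invln2N, shift, negln2hiN, negln2loN, poly[4]; };
struct new_layout { double invln2N, negln2hiN, negln2loN, poly[4], shift; };

int
main (void)
{
  printf ("old: negln2hiN at %zu, poly at %zu\n",
          offsetof (struct old_layout, negln2hiN),   /* 16 */
          offsetof (struct old_layout, poly));       /* 32 */
  printf ("new: negln2hiN at %zu, poly at %zu\n",
          offsetof (struct new_layout, negln2hiN),   /* 8 */
          offsetof (struct new_layout, poly));       /* 24 */
  return 0;
}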


@ -0,0 +1,34 @@
From 3de5112326a4274c97f154f3d335c11965ee960c Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 24 Jul 2024 15:17:47 +0100
Subject: [PATCH] math: Improve layout of expf data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp2f_data struct slightly so that the fields are better aligned.
As a result on targets that support them, load-pair instructions accessing
poly_scaled and invln2_scaled are now 16-byte aligned.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
---
sysdeps/ieee754/flt-32/math_config.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
index d1b06a1a90..5904eb9bac 100644
--- a/sysdeps/ieee754/flt-32/math_config.h
+++ b/sysdeps/ieee754/flt-32/math_config.h
@@ -166,9 +166,9 @@ extern const struct exp2f_data
uint64_t tab[1 << EXP2F_TABLE_BITS];
double shift_scaled;
double poly[EXP2F_POLY_ORDER];
- double shift;
double invln2_scaled;
double poly_scaled[EXP2F_POLY_ORDER];
+ double shift;
} __exp2f_data attribute_hidden;
#define LOGF_TABLE_BITS 4
--
2.27.0