!1022 [sync] PR-1021: sync from glibc upstream 2.38 branch

From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
openeuler-ci-bot 2025-03-28 12:24:20 +00:00 committed by Gitee
commit 5538373d14
9 changed files with 897 additions and 1 deletion

AArch64-Add-SVE-memset.patch

@@ -0,0 +1,200 @@
From 52c2b1556f773d9a75d030160e0e273a5ea84502 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Tue, 24 Dec 2024 18:01:59 +0000
Subject: [PATCH] AArch64: Add SVE memset
Add SVE memset based on the generic memset, with a predicated store for sizes < 16.
Unaligned memsets of 128-1024 bytes improve by ~20% on average by using aligned
stores for the last 64 bytes. Performance of the random memset benchmark improves
by ~2% on Neoverse V1.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
---
sysdeps/aarch64/multiarch/Makefile | 1 +
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 +-
sysdeps/aarch64/multiarch/memset.c | 4 +
sysdeps/aarch64/multiarch/memset_sve_zva64.S | 123 +++++++++++++++++++
4 files changed, 130 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/aarch64/multiarch/memset_sve_zva64.S
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e4720b7468..214b6137b0 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -14,6 +14,7 @@ sysdep_routines += \
memset_generic \
memset_kunpeng \
memset_mops \
+ memset_sve_zva64 \
memset_zva64 \
strlen_asimd \
strlen_generic \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 73038ac810..2fa6baa319 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -56,7 +56,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
#if HAVE_AARCH64_SVE_ASM
- IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
#endif
IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 6deb6865e5..89fde57f42 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
+extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
static inline __typeof (__redirect_memset) *
select_memset_ifunc (void)
@@ -47,6 +48,9 @@ select_memset_ifunc (void)
{
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
+
+ if (zva_size == 64)
+ return __memset_sve_zva64;
}
if (IS_KUNPENG920 (midr))
diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
new file mode 100644
index 0000000000..7fb40fdd9e
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
@@ -0,0 +1,123 @@
+/* Optimized memset for SVE.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ * ZVA size is 64.
+ */
+
+#if HAVE_AARCH64_SVE_ASM
+
+.arch armv8.2-a+sve
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define vlen x5
+#define off x3
+#define dstend2 x5
+
+ENTRY (__memset_sve_zva64)
+ dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_16)
+
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+
+ .p2align 4
+L(set_16):
+ whilelo p0.b, xzr, count
+ st1b z0.b, p0, [dstin]
+ ret
+
+ .p2align 4
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ cmp count, 256
+ b.lo L(no_zva)
+ tst valw, 255
+ b.ne L(no_zva)
+
+ str q0, [dstin]
+ str q0, [dst, 16]
+ bic dst, dstin, 31
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
+ bic x8, x8, 15
+ stp q0, q0, [x8, -48]
+ str q0, [x8, -16]
+ str q0, [dstend, -16]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
+ ret
+
+L(no_zva):
+ str q0, [dstin]
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 16]
+ stp q0, q0, [dst, 48]
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_sve_zva64)
+#endif
--
2.27.0
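
A minimal C rendering of the predicated-store idea in L(set_16) above, using ACLE
SVE intrinsics (a sketch assuming <arm_sve.h>; the helper name is illustrative,
not glibc code):

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Set n bytes in a single predicated store: whilelt builds a predicate
   with only the first n lanes active, so nothing past dst[n-1] is
   written and no branch on n is needed (valid for any n up to the SVE
   vector length; the assembly above uses it for n < 16).  */
static void
sve_set_small (uint8_t *dst, uint8_t c, size_t n)
{
  svbool_t pg = svwhilelt_b8_u64 (0, (uint64_t) n);  /* lanes 0..n-1 on */
  svst1_u8 (pg, dst, svdup_n_u8 (c));                /* masked store    */
}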

AArch64-Improve-generic-strlen.patch

@@ -0,0 +1,92 @@
From 9ca74b8ad1968d935815bdc2f1f1c7e9f2e32f70 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 7 Aug 2024 14:43:47 +0100
Subject: [PATCH] AArch64: Improve generic strlen
Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
size computation to avoid increasing latency. On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
---
sysdeps/aarch64/strlen.S | 39 +++++++++++++++++++++++++++------------
1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 133ef93342..352fb40d3a 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
+L(next16):
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
.p2align 5
L(loop):
- ldr data, [src, 16]
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
+ add src, src, 16
L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
- add result, result, 16
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (STRLEN)
--
2.27.0
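
The two syndrome tricks above can be illustrated in C with NEON intrinsics.
SHRN by 4 narrows the 128-bit compare mask to a 64-bit word with one nibble per
input byte, so the NUL position falls out of a count-trailing-zeros; the loop
instead uses ADDHN, whose 64-bit result is nonzero exactly when some lane
matched, so the terminating iteration does not have to redo SHRN before the
FMOV. A little-endian sketch of the SHRN path (editor's illustration, not
glibc code):

#include <arm_neon.h>
#include <stdint.h>

/* Return the index of the first NUL in the 16 bytes at p, or 16 if none.
   Each input byte maps to one nibble of the 64-bit syndrome, so the
   index is ctz(syndrome) / 4 -- the rbit/clz/lsr sequence in the
   assembly computes the same thing.  */
static int
nul_index16 (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));            /* 0xFF per NUL */
  uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4); /* 128 -> 64    */
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  if (synd == 0)
    return 16;
  return __builtin_ctzll (synd) / 4;
}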

AArch64-Optimize-memset.patch

@@ -0,0 +1,287 @@
From 95aa21432ccbf77225abd485d98df36ba760ff80 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon, 9 Sep 2024 15:26:47 +0100
Subject: [PATCH] AArch64: Optimize memset
Improve small memsets by avoiding branches and using overlapping stores.
Use DC ZVA for memsets over 128 bytes. Remove unnecessary code for ZVA sizes
other than 64 and 128. Performance of the random memset benchmark improves by 24%
on Neoverse N1.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8)
---
sysdeps/aarch64/memset.S | 195 +++++++++++++++++----------------------
1 file changed, 84 insertions(+), 111 deletions(-)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index bbfb7184c3..caafb019e2 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "memset-reg.h"
#ifndef MEMSET
# define MEMSET memset
@@ -25,130 +25,132 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
-ENTRY (MEMSET)
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define off x3
+#define dstend2 x5
+ENTRY (MEMSET)
PTR_ARG (0)
SIZE_ARG (2)
dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_small)
+
add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
- cmp count, 96
- b.hi L(set_long)
- cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+ .p2align 4
/* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
- ret
- nop
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
+ str q0, [dst, 16]
+ tst valw, 255
+ b.ne L(no_zva)
+#ifndef ZVA64_ONLY
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+#endif
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
ret
-L(try_zva):
-#ifndef ZVA64_ONLY
.p2align 3
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
- nop
-#endif
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
- .p2align 4
-L(zva_64):
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- bic dst, dst, 63
stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
add dst, dst, 64
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
#ifndef ZVA64_ONLY
- .p2align 3
+ .p2align 4
L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
+ cmp zva_val, 5 /* ZVA size is 128 bytes. */
+ b.ne L(no_zva)
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
bic dst, dst, 127
sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
+ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
+1: add dst, dst, 128
+ dc zva, dst
subs count, count, 128
b.hi 1b
stp q0, q0, [dstend, -128]
@@ -156,35 +158,6 @@ L(zva_128):
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop. */
- b L(tail64)
#endif
END (MEMSET)
--
2.27.0
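
The new 16..63-byte path is branchless: the second store offset is computed as
(count >> 1) & 16, which is 16 exactly when count >= 32, and the four
overlapping 16-byte stores then cover every length in the range. The same idea
in C (editor's sketch; store16 is an illustrative stand-in for a single
str q-register store):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* One 16-byte store; compilers lower this to a single SIMD store.  */
static void
store16 (uint8_t *p, const uint8_t v[16])
{
  memcpy (p, v, 16);
}

/* Branchless fill of v for 16 <= n <= 63, mirroring the
   "Set 16..63 bytes" block above.  The four stores overlap so that
   [0, 16+off) and [n-16-off, n) together cover all n bytes.  */
static void
set_16_63 (uint8_t *dst, const uint8_t v[16], size_t n)
{
  size_t off = (n >> 1) & 16;       /* 16 when n >= 32, else 0 */
  store16 (dst, v);                 /* bytes 0..15             */
  store16 (dst + off, v);           /* bytes off..off+15       */
  store16 (dst + n - off - 16, v);  /* mirrored from the end   */
  store16 (dst + n - 16, v);        /* bytes n-16..n-1         */
}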

AArch64-Remove-zva_128-from-memset.patch

@@ -0,0 +1,65 @@
From 5fe151d86a19bc3dc791fd2d92efeb6c6e11cf64 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon, 25 Nov 2024 18:43:08 +0000
Subject: [PATCH] AArch64: Remove zva_128 from memset
Remove ZVA 128 support from memset - the new memset no longer
guarantees count >= 256, which can result in underflow and a
crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA
size of 128 and its memcpy implementation was removed in commit
e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special
case too.
[1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html
Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com>
(cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d)
---
sysdeps/aarch64/memset.S | 25 +------------------------
1 file changed, 1 insertion(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index caafb019e2..71814d0b2f 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -104,7 +104,7 @@ L(set_long):
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
+ b.ne L(no_zva)
#endif
stp q0, q0, [dst, 32]
bic dst, dstin, 63
@@ -137,28 +137,5 @@ L(no_zva_loop):
stp q0, q0, [dstend, -32]
ret
-#ifndef ZVA64_ONLY
- .p2align 4
-L(zva_128):
- cmp zva_val, 5 /* ZVA size is 128 bytes. */
- b.ne L(no_zva)
-
- stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128 + 128 /* Adjust count and bias for loop. */
-1: add dst, dst, 128
- dc zva, dst
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-#endif
-
END (MEMSET)
libc_hidden_builtin_def (MEMSET)
--
2.27.0
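
The underflow the commit message refers to comes from the biased loop counter:
the removed zva_128 block aligned dst down to 128 bytes and subtracted 256 from
an unsigned count, which is only safe when count >= 256 -- a guarantee the old
memset provided on entry to the ZVA path but the rewritten one does not. A small
sketch of the failing arithmetic (editor's illustration, hypothetical address):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* memset (dstin, c, 160): big enough to reach L(set_long), too
     small for the 128 + 128 bias used by the zva_128 block.  */
  uint64_t dstin = 0x10000, n = 160;
  uint64_t dstend = dstin + n;
  uint64_t dst = dstin & ~127ULL;                 /* bic dst, dst, 127 */
  uint64_t count = (dstend - dst) - (128 + 128);  /* biased for loop   */
  printf ("count = %llu\n", (unsigned long long) count);
  /* Prints 18446744073709551520: the subtraction wrapped, so the
     dc zva loop would run far past the buffer and crash.  */
  return 0;
}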

AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch

@@ -0,0 +1,29 @@
From 097299ffa904b327fce83770fa6a522e4393ddb3 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu, 27 Feb 2025 16:28:52 +0000
Subject: [PATCH] AArch64: Use prefer_sve_ifuncs for SVE memset
Use prefer_sve_ifuncs for SVE memset just like memcpy.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4)
---
sysdeps/aarch64/multiarch/memset.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 89fde57f42..ce5d35a20e 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -49,7 +49,7 @@ select_memset_ifunc (void)
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
- if (zva_size == 64)
+ if (prefer_sve_ifuncs && zva_size == 64)
return __memset_sve_zva64;
}
--
2.27.0

assert-Add-test-for-CVE-2025-0395.patch

@@ -0,0 +1,132 @@
From f984e2d7e8299726891a1a497a3c36cd5542a0bf Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri, 31 Jan 2025 12:16:30 -0500
Subject: [PATCH] assert: Add test for CVE-2025-0395
Use the __progname symbol to override the program name to induce the
failure that CVE-2025-0395 describes.
This is related to BZ #32582
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
---
assert/Makefile | 1 +
assert/tst-assert-sa-2025-0001.c | 92 ++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+)
create mode 100644 assert/tst-assert-sa-2025-0001.c
diff --git a/assert/Makefile b/assert/Makefile
index 67f4e6a570..b0fc9fc4d2 100644
--- a/assert/Makefile
+++ b/assert/Makefile
@@ -38,6 +38,7 @@ tests := \
test-assert-perr \
tst-assert-c++ \
tst-assert-g++ \
+ tst-assert-sa-2025-0001 \
# tests
ifeq ($(have-cxx-thread_local),yes)
diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
new file mode 100644
index 0000000000..102cb0078d
--- /dev/null
+++ b/assert/tst-assert-sa-2025-0001.c
@@ -0,0 +1,92 @@
+/* Test for CVE-2025-0395.
+ Copyright The GNU Toolchain Authors.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Test that a large enough __progname does not result in a buffer overflow
+ when printing an assertion failure. This was CVE-2025-0395. */
+#include <assert.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xstdio.h>
+#include <support/xunistd.h>
+
+extern const char *__progname;
+
+int
+do_test (int argc, char **argv)
+{
+
+ support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
+ ignore_stderr ();
+
+ /* XXX assumes that the assert is on a 2 digit line number. */
+ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
+
+ int ret = fprintf (stderr, prompt, __FILE__);
+ if (ret < 0)
+ FAIL_EXIT1 ("fprintf failed: %m\n");
+
+ size_t pagesize = getpagesize ();
+ size_t namesize = pagesize - 1 - ret;
+
+ /* Alter the progname so that the assert message fills the entire page. */
+ char progname[namesize];
+ memset (progname, 'A', namesize - 1);
+ progname[namesize - 1] = '\0';
+ __progname = progname;
+
+ FILE *f = xfopen ("/proc/self/maps", "r");
+ char *line = NULL;
+ size_t len = 0;
+ uintptr_t prev_to = 0;
+
+ /* Pad the beginning of every writable mapping with a PROT_NONE map. This
+ ensures that the mmap in the assert_fail path never ends up below a
+ writable map and will terminate immediately in case of a buffer
+ overflow. */
+ while (xgetline (&line, &len, f))
+ {
+ uintptr_t from, to;
+ char perm[4];
+
+ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ",
+ &from, &to,
+ &perm[0], &perm[1], &perm[2], &perm[3]);
+
+ bool writable = (memchr (perm, 'w', 4) != NULL);
+
+ if (prev_to != 0 && from - prev_to > pagesize && writable)
+ xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+
+ prev_to = to;
+ }
+
+ xfclose (f);
+
+ assert (argc < 1);
+ return 0;
+}
+
+#define EXPECTED_SIGNAL SIGABRT
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
--
2.27.0

glibc.spec

@@ -67,7 +67,7 @@
##############################################################################
Name: glibc
Version: 2.38
-Release: 56
+Release: 57
Summary: The GNU libc libraries
License: %{all_license}
URL: http://www.gnu.org/software/glibc/
@@ -278,6 +278,14 @@ Patch188: backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
Patch189: backport-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
Patch190: backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
Patch191: backport-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
+Patch192: assert-Add-test-for-CVE-2025-0395.patch
+Patch193: AArch64-Improve-generic-strlen.patch
+Patch194: AArch64-Optimize-memset.patch
+Patch195: AArch64-Remove-zva_128-from-memset.patch
+Patch196: math-Improve-layout-of-expf-data.patch
+Patch197: AArch64-Add-SVE-memset.patch
+Patch198: AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch
+Patch199: math-Improve-layout-of-exp-exp10-data.patch

#openEuler patch list
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
@@ -1502,6 +1510,16 @@ fi
%endif

%changelog
+* Fri Mar 28 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-57
+- math: Improve layout of exp/exp10 data
+- AArch64: Use prefer_sve_ifuncs for SVE memset
+- AArch64: Add SVE memset
+- math: Improve layout of expf data
+- AArch64: Remove zva_128 from memset
+- AArch64: Optimize memset
+- AArch64: Improve generic strlen
+- assert: Add test for CVE-2025-0395
+
* Wed Mar 12 2025 xiajimei <xiejiamei@hygon.cn> - 2.38-56
- x86: Enable non-temporal memset for Hygon processors
- x86: Add `Avoid_STOSB` tunable to allow NT memset without ERMS

math-Improve-layout-of-exp-exp10-data.patch

@@ -0,0 +1,39 @@
From 5a08d049dc5037e89eb95bb1506652f0043fa39e Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Fri, 13 Dec 2024 15:43:07 +0000
Subject: [PATCH] math: Improve layout of exp/exp10 data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp_data struct slightly so that the fields are better aligned
and without gaps. As a result on targets that support them, more load-pair
instructions are used in exp.
The exp benchmark improves by 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on
Neoverse V2.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7)
---
sysdeps/ieee754/dbl-64/math_config.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 19af33fd86..52b720ecd1 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -195,10 +195,11 @@ check_uflow (double x)
extern const struct exp_data
{
double invln2N;
- double shift;
double negln2hiN;
double negln2loN;
double poly[4]; /* Last four coefficients. */
+ double shift;
+
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
--
2.27.0
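
A quick way to audit a layout change like this is to print field offsets: GCC
aligns a global whose size is >= 16 bytes to a 16-byte boundary, so the offsets
show which adjacent doubles share an aligned 16-byte block and can be fetched
with one aligned LDP. A sketch over a reduced copy of the struct (editor's
illustration, not the real exp_data):

#include <stddef.h>
#include <stdio.h>

/* Reduced copy of the reordered layout: with shift moved after poly,
   invln2N/negln2hiN occupy the aligned block [0,16) and
   negln2loN/poly[0] the block [16,32).  */
struct exp_data_sketch
{
  double invln2N;    /* offset  0 */
  double negln2hiN;  /* offset  8 */
  double negln2loN;  /* offset 16 */
  double poly[4];    /* offset 24 */
  double shift;      /* offset 56 */
};

int
main (void)
{
  printf ("negln2hiN=%zu negln2loN=%zu shift=%zu size=%zu\n",
          offsetof (struct exp_data_sketch, negln2hiN),
          offsetof (struct exp_data_sketch, negln2loN),
          offsetof (struct exp_data_sketch, shift),
          sizeof (struct exp_data_sketch));
  return 0;
}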

math-Improve-layout-of-expf-data.patch

@@ -0,0 +1,34 @@
From 3de5112326a4274c97f154f3d335c11965ee960c Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 24 Jul 2024 15:17:47 +0100
Subject: [PATCH] math: Improve layout of expf data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp2f_data struct slightly so that the fields are better aligned.
As a result on targets that support them, load-pair instructions accessing
poly_scaled and invln2_scaled are now 16-byte aligned.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
---
sysdeps/ieee754/flt-32/math_config.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
index d1b06a1a90..5904eb9bac 100644
--- a/sysdeps/ieee754/flt-32/math_config.h
+++ b/sysdeps/ieee754/flt-32/math_config.h
@@ -166,9 +166,9 @@ extern const struct exp2f_data
uint64_t tab[1 << EXP2F_TABLE_BITS];
double shift_scaled;
double poly[EXP2F_POLY_ORDER];
- double shift;
double invln2_scaled;
double poly_scaled[EXP2F_POLY_ORDER];
+ double shift;
} __exp2f_data attribute_hidden;
#define LOGF_TABLE_BITS 4
--
2.27.0