!1022 [sync] PR-1021: sync from glibc upstream 2.38 branch

From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
openeuler-ci-bot 2025-03-28 12:24:20 +00:00 committed by Gitee
commit 5538373d14
9 changed files with 897 additions and 1 deletion

AArch64-Add-SVE-memset.patch

@@ -0,0 +1,200 @@
From 52c2b1556f773d9a75d030160e0e273a5ea84502 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Tue, 24 Dec 2024 18:01:59 +0000
Subject: [PATCH] AArch64: Add SVE memset
Add SVE memset based on the generic memset, with a predicated store for sizes < 16.
Unaligned memsets of 128-1024 bytes improve by ~20% on average by using aligned
stores for the last 64 bytes. Performance of the random memset benchmark improves
by ~2% on Neoverse V1.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
---
sysdeps/aarch64/multiarch/Makefile | 1 +
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 +-
sysdeps/aarch64/multiarch/memset.c | 4 +
sysdeps/aarch64/multiarch/memset_sve_zva64.S | 123 +++++++++++++++++++
4 files changed, 130 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/aarch64/multiarch/memset_sve_zva64.S
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e4720b7468..214b6137b0 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -14,6 +14,7 @@ sysdep_routines += \
memset_generic \
memset_kunpeng \
memset_mops \
+ memset_sve_zva64 \
memset_zva64 \
strlen_asimd \
strlen_generic \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 73038ac810..2fa6baa319 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -56,7 +56,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
#if HAVE_AARCH64_SVE_ASM
- IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
#endif
IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 6deb6865e5..89fde57f42 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
+extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
static inline __typeof (__redirect_memset) *
select_memset_ifunc (void)
@@ -47,6 +48,9 @@ select_memset_ifunc (void)
{
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
+
+ if (zva_size == 64)
+ return __memset_sve_zva64;
}
if (IS_KUNPENG920 (midr))
diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
new file mode 100644
index 0000000000..7fb40fdd9e
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
@@ -0,0 +1,123 @@
+/* Optimized memset for SVE.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ * ZVA size is 64.
+ */
+
+#if HAVE_AARCH64_SVE_ASM
+
+.arch armv8.2-a+sve
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define vlen x5
+#define off x3
+#define dstend2 x5
+
+ENTRY (__memset_sve_zva64)
+ dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_16)
+
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+
+ .p2align 4
+L(set_16):
+ whilelo p0.b, xzr, count
+ st1b z0.b, p0, [dstin]
+ ret
+
+ .p2align 4
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ cmp count, 256
+ b.lo L(no_zva)
+ tst valw, 255
+ b.ne L(no_zva)
+
+ str q0, [dstin]
+ str q0, [dst, 16]
+ bic dst, dstin, 31
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
+ bic x8, x8, 15
+ stp q0, q0, [x8, -48]
+ str q0, [x8, -16]
+ str q0, [dstend, -16]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
+ ret
+
+L(no_zva):
+ str q0, [dstin]
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 16]
+ stp q0, q0, [dst, 48]
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_sve_zva64)
+#endif
--
2.27.0
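
A minimal C rendering of the predicated-store idea in L(set_16) above, using ACLE
SVE intrinsics (a sketch assuming <arm_sve.h>; the helper name is illustrative,
not glibc code):

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Set n bytes in a single predicated store: whilelt builds a predicate
   with only the first n lanes active, so nothing past dst[n-1] is
   written and no branch on n is needed (valid for any n up to the SVE
   vector length; the assembly above uses it for n < 16).  */
static void
sve_set_small (uint8_t *dst, uint8_t c, size_t n)
{
  svbool_t pg = svwhilelt_b8_u64 (0, (uint64_t) n);  /* lanes 0..n-1 on */
  svst1_u8 (pg, dst, svdup_n_u8 (c));                /* masked store    */
}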

AArch64-Improve-generic-strlen.patch

@@ -0,0 +1,92 @@
From 9ca74b8ad1968d935815bdc2f1f1c7e9f2e32f70 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 7 Aug 2024 14:43:47 +0100
Subject: [PATCH] AArch64: Improve generic strlen
Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
size computation to avoid increasing latency. On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
---
sysdeps/aarch64/strlen.S | 39 +++++++++++++++++++++++++++------------
1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 133ef93342..352fb40d3a 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
+L(next16):
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
.p2align 5
L(loop):
- ldr data, [src, 16]
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
+ add src, src, 16
L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
- add result, result, 16
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (STRLEN)
--
2.27.0
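
The two syndrome tricks above can be illustrated in C with NEON intrinsics.
SHRN by 4 narrows the 128-bit compare mask to a 64-bit word with one nibble per
input byte, so the NUL position falls out of a count-trailing-zeros; the loop
instead uses ADDHN, whose 64-bit result is nonzero exactly when some lane
matched, so the terminating iteration does not have to redo SHRN before the
FMOV. A little-endian sketch of the SHRN path (editor's illustration, not
glibc code):

#include <arm_neon.h>
#include <stdint.h>

/* Return the index of the first NUL in the 16 bytes at p, or 16 if none.
   Each input byte maps to one nibble of the 64-bit syndrome, so the
   index is ctz(syndrome) / 4 -- the rbit/clz/lsr sequence in the
   assembly computes the same thing.  */
static int
nul_index16 (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));            /* 0xFF per NUL */
  uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4); /* 128 -> 64    */
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  if (synd == 0)
    return 16;
  return __builtin_ctzll (synd) / 4;
}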

AArch64-Optimize-memset.patch

@@ -0,0 +1,287 @@
From 95aa21432ccbf77225abd485d98df36ba760ff80 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon, 9 Sep 2024 15:26:47 +0100
Subject: [PATCH] AArch64: Optimize memset
Improve small memsets by avoiding branches and using overlapping stores.
Use DC ZVA for memsets over 128 bytes. Remove unnecessary code for ZVA sizes
other than 64 and 128. Performance of the random memset benchmark improves by 24%
on Neoverse N1.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8)
---
sysdeps/aarch64/memset.S | 195 +++++++++++++++++----------------------
1 file changed, 84 insertions(+), 111 deletions(-)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index bbfb7184c3..caafb019e2 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "memset-reg.h"
#ifndef MEMSET
# define MEMSET memset
@@ -25,130 +25,132 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
-ENTRY (MEMSET)
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define off x3
+#define dstend2 x5
+ENTRY (MEMSET)
PTR_ARG (0)
SIZE_ARG (2)
dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_small)
+
add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
- cmp count, 96
- b.hi L(set_long)
- cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+ .p2align 4
/* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
- ret
- nop
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
+ str q0, [dst, 16]
+ tst valw, 255
+ b.ne L(no_zva)
+#ifndef ZVA64_ONLY
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+#endif
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
ret
-L(try_zva):
-#ifndef ZVA64_ONLY
.p2align 3
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
- nop
-#endif
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
- .p2align 4
-L(zva_64):
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- bic dst, dst, 63
stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
add dst, dst, 64
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
#ifndef ZVA64_ONLY
- .p2align 3
+ .p2align 4
L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
+ cmp zva_val, 5 /* ZVA size is 128 bytes. */
+ b.ne L(no_zva)
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
bic dst, dst, 127
sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
+ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
+1: add dst, dst, 128
+ dc zva, dst
subs count, count, 128
b.hi 1b
stp q0, q0, [dstend, -128]
@@ -156,35 +158,6 @@ L(zva_128):
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop. */
- b L(tail64)
#endif
END (MEMSET)
--
2.27.0
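
The new 16..63-byte path is branchless: the second store offset is computed as
(count >> 1) & 16, which is 16 exactly when count >= 32, and the four
overlapping 16-byte stores then cover every length in the range. The same idea
in C (editor's sketch; store16 is an illustrative stand-in for a single
str q-register store):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* One 16-byte store; compilers lower this to a single SIMD store.  */
static void
store16 (uint8_t *p, const uint8_t v[16])
{
  memcpy (p, v, 16);
}

/* Branchless fill of v for 16 <= n <= 63, mirroring the
   "Set 16..63 bytes" block above.  The four stores overlap so that
   [0, 16+off) and [n-16-off, n) together cover all n bytes.  */
static void
set_16_63 (uint8_t *dst, const uint8_t v[16], size_t n)
{
  size_t off = (n >> 1) & 16;       /* 16 when n >= 32, else 0 */
  store16 (dst, v);                 /* bytes 0..15             */
  store16 (dst + off, v);           /* bytes off..off+15       */
  store16 (dst + n - off - 16, v);  /* mirrored from the end   */
  store16 (dst + n - 16, v);        /* bytes n-16..n-1         */
}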

AArch64-Remove-zva_128-from-memset.patch

@@ -0,0 +1,65 @@
From 5fe151d86a19bc3dc791fd2d92efeb6c6e11cf64 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon, 25 Nov 2024 18:43:08 +0000
Subject: [PATCH] AArch64: Remove zva_128 from memset
Remove ZVA 128 support from memset - the new memset no longer
guarantees count >= 256, which can result in underflow and a
crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA
size of 128 and its memcpy implementation was removed in commit
e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special
case too.
[1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html
Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com>
(cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d)
---
sysdeps/aarch64/memset.S | 25 +------------------------
1 file changed, 1 insertion(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index caafb019e2..71814d0b2f 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -104,7 +104,7 @@ L(set_long):
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
+ b.ne L(no_zva)
#endif
stp q0, q0, [dst, 32]
bic dst, dstin, 63
@@ -137,28 +137,5 @@ L(no_zva_loop):
stp q0, q0, [dstend, -32]
ret
-#ifndef ZVA64_ONLY
- .p2align 4
-L(zva_128):
- cmp zva_val, 5 /* ZVA size is 128 bytes. */
- b.ne L(no_zva)
-
- stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128 + 128 /* Adjust count and bias for loop. */
-1: add dst, dst, 128
- dc zva, dst
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-#endif
-
END (MEMSET)
libc_hidden_builtin_def (MEMSET)
--
2.27.0
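
The underflow the commit message refers to comes from the biased loop counter:
the removed zva_128 block aligned dst down to 128 bytes and subtracted 256 from
an unsigned count, which is only safe when count >= 256 -- a guarantee the old
memset provided on entry to the ZVA path but the rewritten one does not. A small
sketch of the failing arithmetic (editor's illustration, hypothetical address):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* memset (dstin, c, 160): big enough to reach L(set_long), too
     small for the 128 + 128 bias used by the zva_128 block.  */
  uint64_t dstin = 0x10000, n = 160;
  uint64_t dstend = dstin + n;
  uint64_t dst = dstin & ~127ULL;                 /* bic dst, dst, 127 */
  uint64_t count = (dstend - dst) - (128 + 128);  /* biased for loop   */
  printf ("count = %llu\n", (unsigned long long) count);
  /* Prints 18446744073709551520: the subtraction wrapped, so the
     dc zva loop would run far past the buffer and crash.  */
  return 0;
}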

AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch

@@ -0,0 +1,29 @@
From 097299ffa904b327fce83770fa6a522e4393ddb3 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu, 27 Feb 2025 16:28:52 +0000
Subject: [PATCH] AArch64: Use prefer_sve_ifuncs for SVE memset
Use prefer_sve_ifuncs for SVE memset just like memcpy.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4)
---
sysdeps/aarch64/multiarch/memset.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 89fde57f42..ce5d35a20e 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -49,7 +49,7 @@ select_memset_ifunc (void)
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
- if (zva_size == 64)
+ if (prefer_sve_ifuncs && zva_size == 64)
return __memset_sve_zva64;
}
--
2.27.0

assert-Add-test-for-CVE-2025-0395.patch

@@ -0,0 +1,132 @@
From f984e2d7e8299726891a1a497a3c36cd5542a0bf Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri, 31 Jan 2025 12:16:30 -0500
Subject: [PATCH] assert: Add test for CVE-2025-0395
Use the __progname symbol to override the program name to induce the
failure that CVE-2025-0395 describes.
This is related to BZ #32582
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
---
assert/Makefile | 1 +
assert/tst-assert-sa-2025-0001.c | 92 ++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+)
create mode 100644 assert/tst-assert-sa-2025-0001.c
diff --git a/assert/Makefile b/assert/Makefile
index 67f4e6a570..b0fc9fc4d2 100644
--- a/assert/Makefile
+++ b/assert/Makefile
@@ -38,6 +38,7 @@ tests := \
test-assert-perr \
tst-assert-c++ \
tst-assert-g++ \
+ tst-assert-sa-2025-0001 \
# tests
ifeq ($(have-cxx-thread_local),yes)
diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
new file mode 100644
index 0000000000..102cb0078d
--- /dev/null
+++ b/assert/tst-assert-sa-2025-0001.c
@@ -0,0 +1,92 @@
+/* Test for CVE-2025-0395.
+ Copyright The GNU Toolchain Authors.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Test that a large enough __progname does not result in a buffer overflow
+ when printing an assertion failure. This was CVE-2025-0395. */
+#include <assert.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xstdio.h>
+#include <support/xunistd.h>
+
+extern const char *__progname;
+
+int
+do_test (int argc, char **argv)
+{
+
+ support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
+ ignore_stderr ();
+
+ /* XXX assumes that the assert is on a 2 digit line number. */
+ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
+
+ int ret = fprintf (stderr, prompt, __FILE__);
+ if (ret < 0)
+ FAIL_EXIT1 ("fprintf failed: %m\n");
+
+ size_t pagesize = getpagesize ();
+ size_t namesize = pagesize - 1 - ret;
+
+ /* Alter the progname so that the assert message fills the entire page. */
+ char progname[namesize];
+ memset (progname, 'A', namesize - 1);
+ progname[namesize - 1] = '\0';
+ __progname = progname;
+
+ FILE *f = xfopen ("/proc/self/maps", "r");
+ char *line = NULL;
+ size_t len = 0;
+ uintptr_t prev_to = 0;
+
+ /* Pad the beginning of every writable mapping with a PROT_NONE map. This
+ ensures that the mmap in the assert_fail path never ends up below a
+ writable map and will terminate immediately in case of a buffer
+ overflow. */
+ while (xgetline (&line, &len, f))
+ {
+ uintptr_t from, to;
+ char perm[4];
+
+ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ",
+ &from, &to,
+ &perm[0], &perm[1], &perm[2], &perm[3]);
+
+ bool writable = (memchr (perm, 'w', 4) != NULL);
+
+ if (prev_to != 0 && from - prev_to > pagesize && writable)
+ xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+
+ prev_to = to;
+ }
+
+ xfclose (f);
+
+ assert (argc < 1);
+ return 0;
+}
+
+#define EXPECTED_SIGNAL SIGABRT
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
--
2.27.0

glibc.spec

@@ -67,7 +67,7 @@
##############################################################################
Name: glibc
Version: 2.38
-Release: 56
+Release: 57
Summary: The GNU libc libraries
License: %{all_license}
URL: http://www.gnu.org/software/glibc/
@@ -278,6 +278,14 @@ Patch188: backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
Patch189: backport-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
Patch190: backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
Patch191: backport-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
+Patch192: assert-Add-test-for-CVE-2025-0395.patch
+Patch193: AArch64-Improve-generic-strlen.patch
+Patch194: AArch64-Optimize-memset.patch
+Patch195: AArch64-Remove-zva_128-from-memset.patch
+Patch196: math-Improve-layout-of-expf-data.patch
+Patch197: AArch64-Add-SVE-memset.patch
+Patch198: AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch
+Patch199: math-Improve-layout-of-exp-exp10-data.patch

#openEuler patch list
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
@@ -1502,6 +1510,16 @@ fi
%endif

%changelog
+* Fri Mar 28 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-57
+- math: Improve layout of exp/exp10 data
+- AArch64: Use prefer_sve_ifuncs for SVE memset
+- AArch64: Add SVE memset
+- math: Improve layout of expf data
+- AArch64: Remove zva_128 from memset
+- AArch64: Optimize memset
+- AArch64: Improve generic strlen
+- assert: Add test for CVE-2025-0395
+
* Wed Mar 12 2025 xiajimei <xiejiamei@hygon.cn> - 2.38-56
- x86: Enable non-temporal memset for Hygon processors
- x86: Add `Avoid_STOSB` tunable to allow NT memset without ERMS

math-Improve-layout-of-exp-exp10-data.patch

@@ -0,0 +1,39 @@
From 5a08d049dc5037e89eb95bb1506652f0043fa39e Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Fri, 13 Dec 2024 15:43:07 +0000
Subject: [PATCH] math: Improve layout of exp/exp10 data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp_data struct slightly so that the fields are better aligned
and without gaps. As a result on targets that support them, more load-pair
instructions are used in exp.
The exp benchmark improves by 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on
Neoverse V2.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7)
---
sysdeps/ieee754/dbl-64/math_config.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 19af33fd86..52b720ecd1 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -195,10 +195,11 @@ check_uflow (double x)
extern const struct exp_data
{
double invln2N;
- double shift;
double negln2hiN;
double negln2loN;
double poly[4]; /* Last four coefficients. */
+ double shift;
+
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
--
2.27.0
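
A quick way to audit a layout change like this is to print field offsets: GCC
aligns a global whose size is >= 16 bytes to a 16-byte boundary, so the offsets
show which adjacent doubles share an aligned 16-byte block and can be fetched
with one aligned LDP. A sketch over a reduced copy of the struct (editor's
illustration, not the real exp_data):

#include <stddef.h>
#include <stdio.h>

/* Reduced copy of the reordered layout: with shift moved after poly,
   invln2N/negln2hiN occupy the aligned block [0,16) and
   negln2loN/poly[0] the block [16,32).  */
struct exp_data_sketch
{
  double invln2N;    /* offset  0 */
  double negln2hiN;  /* offset  8 */
  double negln2loN;  /* offset 16 */
  double poly[4];    /* offset 24 */
  double shift;      /* offset 56 */
};

int
main (void)
{
  printf ("negln2hiN=%zu negln2loN=%zu shift=%zu size=%zu\n",
          offsetof (struct exp_data_sketch, negln2hiN),
          offsetof (struct exp_data_sketch, negln2loN),
          offsetof (struct exp_data_sketch, shift),
          sizeof (struct exp_data_sketch));
  return 0;
}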

math-Improve-layout-of-expf-data.patch

@@ -0,0 +1,34 @@
From 3de5112326a4274c97f154f3d335c11965ee960c Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 24 Jul 2024 15:17:47 +0100
Subject: [PATCH] math: Improve layout of expf data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp2f_data struct slightly so that the fields are better aligned.
As a result on targets that support them, load-pair instructions accessing
poly_scaled and invln2_scaled are now 16-byte aligned.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
---
sysdeps/ieee754/flt-32/math_config.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
index d1b06a1a90..5904eb9bac 100644
--- a/sysdeps/ieee754/flt-32/math_config.h
+++ b/sysdeps/ieee754/flt-32/math_config.h
@@ -166,9 +166,9 @@ extern const struct exp2f_data
uint64_t tab[1 << EXP2F_TABLE_BITS];
double shift_scaled;
double poly[EXP2F_POLY_ORDER];
- double shift;
double invln2_scaled;
double poly_scaled[EXP2F_POLY_ORDER];
+ double shift;
} __exp2f_data attribute_hidden;
#define LOGF_TABLE_BITS 4
--
2.27.0