Compare commits

...

10 Commits

Author SHA1 Message Date
openeuler-ci-bot
5538373d14
!1022 [sync] PR-1021: sync from glibc upstream 2.38 branch
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-03-28 12:24:20 +00:00
Qingqing Li
b453407cdf sync from glibc upstream 2.38 branch.
Below is the patch list:
- math: Improve layout of exp/exp10 data
- AArch64: Use prefer_sve_ifuncs for SVE memset
- AArch64: Add SVE memset
- math: Improve layout of expf data
- AArch64: Remove zva_128 from memset
- AArch64: Optimize memset
- AArch64: Improve generic strlen
- assert: Add test for CVE-2025-0395

(cherry picked from commit a6a6276229d415c277b108ed8e6ef4f2fe517bae)
2025-03-28 17:56:22 +08:00
openeuler-ci-bot
d9f212c1d8
!1014 [sync] PR-1012: x86: Add support for Hygon processors
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-03-13 06:59:26 +00:00
Xie jiamei
cb3e0b2e06 x86: Add support for Hygon processors
Signed-off-by: Xie jiamei <xiejiamei@hygon.cn>
(cherry picked from commit 9cf451dd6fdd13ec64780b1f56c84778f99449fb)
2025-03-13 14:58:58 +08:00
openeuler-ci-bot
a2a517c64a
!1010 [sync] PR-1006: elf: Keep using minimal malloc after early DTV resize
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-03-08 02:00:56 +00:00
shixuantong
e214ed3103 elf: Keep using minimal malloc after early DTV resize
(cherry picked from commit 9eae27b47fc8fcad542f939ee869f65e4405421c)
2025-03-08 09:59:14 +08:00
openeuler-ci-bot
2b4695acd8
!999 [sync] PR-995: glibc testcases use the newly built ld.so instead of the environment's default installed ld.so
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-02-18 13:43:00 +00:00
shixuantong
b606cd617b glibc testcases use the newly built ld.so instead of the environment's default installed ld.so
(cherry picked from commit 39be4915e4725de1e52ced4c6b8d0323703fc8c2)
2025-02-18 19:30:51 +08:00
openeuler-ci-bot
2f571cfc1d
!989 [sync] PR-986: fix CVE-2019-1010023
From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
2025-02-07 09:19:08 +00:00
shixuantong
9b958700fd fix CVE-2019-1010023
(cherry picked from commit 0e80112809f744dee46b79cb37b4e8b28f546962)
2025-02-07 14:05:33 +08:00
21 changed files with 2467 additions and 2 deletions


@@ -0,0 +1,200 @@
From 52c2b1556f773d9a75d030160e0e273a5ea84502 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Tue, 24 Dec 2024 18:01:59 +0000
Subject: [PATCH] AArch64: Add SVE memset
Add an SVE memset based on the generic memset, with a predicated store for sizes < 16.
Unaligned memsets of 128-1024 bytes are improved by ~20% on average by using aligned
stores for the last 64 bytes. Performance of the random memset benchmark improves
by ~2% on Neoverse V1.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
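For sizes below 16, the patch replaces a branchy tail with a single predicated SVE store. A minimal C sketch of the idea using ACLE intrinsics (assumes an SVE-enabled toolchain, e.g. -march=armv8.2-a+sve; the function name is illustrative, not glibc's):

```c
#include <arm_sve.h>
#include <stddef.h>

/* Write exactly `count` bytes (count < 16) with one predicated store,
   mirroring the whilelo/st1b pair in __memset_sve_zva64.  Valid because
   the minimum SVE vector length is 16 bytes. */
static void
memset_small_sve (unsigned char *dst, int c, size_t count)
{
  svbool_t p = svwhilelt_b8_u64 (0, count);   /* lanes 0..count-1 active */
  svst1_u8 (p, dst, svdup_n_u8 ((unsigned char) c));
}
```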
---
sysdeps/aarch64/multiarch/Makefile | 1 +
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 +-
sysdeps/aarch64/multiarch/memset.c | 4 +
sysdeps/aarch64/multiarch/memset_sve_zva64.S | 123 +++++++++++++++++++
4 files changed, 130 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/aarch64/multiarch/memset_sve_zva64.S
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e4720b7468..214b6137b0 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -14,6 +14,7 @@ sysdep_routines += \
memset_generic \
memset_kunpeng \
memset_mops \
+ memset_sve_zva64 \
memset_zva64 \
strlen_asimd \
strlen_generic \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 73038ac810..2fa6baa319 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -56,7 +56,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
#if HAVE_AARCH64_SVE_ASM
- IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
#endif
IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 6deb6865e5..89fde57f42 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
+extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
static inline __typeof (__redirect_memset) *
select_memset_ifunc (void)
@@ -47,6 +48,9 @@ select_memset_ifunc (void)
{
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
+
+ if (zva_size == 64)
+ return __memset_sve_zva64;
}
if (IS_KUNPENG920 (midr))
diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
new file mode 100644
index 0000000000..7fb40fdd9e
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
@@ -0,0 +1,123 @@
+/* Optimized memset for SVE.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ * ZVA size is 64.
+ */
+
+#if HAVE_AARCH64_SVE_ASM
+
+.arch armv8.2-a+sve
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define vlen x5
+#define off x3
+#define dstend2 x5
+
+ENTRY (__memset_sve_zva64)
+ dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_16)
+
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+
+ .p2align 4
+L(set_16):
+ whilelo p0.b, xzr, count
+ st1b z0.b, p0, [dstin]
+ ret
+
+ .p2align 4
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ cmp count, 256
+ b.lo L(no_zva)
+ tst valw, 255
+ b.ne L(no_zva)
+
+ str q0, [dstin]
+ str q0, [dst, 16]
+ bic dst, dstin, 31
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
+ bic x8, x8, 15
+ stp q0, q0, [x8, -48]
+ str q0, [x8, -16]
+ str q0, [dstend, -16]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
+ ret
+
+L(no_zva):
+ str q0, [dstin]
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 16]
+ stp q0, q0, [dst, 48]
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_sve_zva64)
+#endif
--
2.27.0


@@ -0,0 +1,92 @@
From 9ca74b8ad1968d935815bdc2f1f1c7e9f2e32f70 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 7 Aug 2024 14:43:47 +0100
Subject: [PATCH] AArch64: Improve generic strlen
Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
size computation to avoid increasing latency. On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
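The core of this strlen design is the nibble-per-byte syndrome: a 16-byte NUL compare narrowed with SHRN #4 packs into 64 bits, and a count-trailing-zeros divided by 4 yields the NUL's byte index. A rough, self-contained NEON sketch of that step (little-endian only; the helper name is illustrative, not glibc's):

```c
#include <arm_neon.h>
#include <stdint.h>

/* Return the index of the first NUL in these 16 bytes, or -1 if none.
   Mirrors the cmeq/shrn/fmov/rbit/clz sequence in the patched strlen. */
static inline int
nul_index_16 (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t cmp  = vceqq_u8 (data, vdupq_n_u8 (0));     /* 0xff per NUL byte */
  uint8x8_t  nib  = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t   synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  if (synd == 0)
    return -1;                          /* no NUL; caller keeps scanning */
  return __builtin_ctzll (synd) >> 2;   /* 4 syndrome bits per input byte */
}
```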
---
sysdeps/aarch64/strlen.S | 39 +++++++++++++++++++++++++++------------
1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 133ef93342..352fb40d3a 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
+L(next16):
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
.p2align 5
L(loop):
- ldr data, [src, 16]
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
+ add src, src, 16
L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
- add result, result, 16
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (STRLEN)
--
2.27.0


@@ -0,0 +1,287 @@
From 95aa21432ccbf77225abd485d98df36ba760ff80 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon, 9 Sep 2024 15:26:47 +0100
Subject: [PATCH] AArch64: Optimize memset
Improve small memsets by avoiding branches and using overlapping stores.
Use DC ZVA for sets over 128 bytes. Remove unnecessary code for ZVA sizes
other than 64 and 128. Performance of the random memset benchmark improves by 24%
on Neoverse N1.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8)
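The branch avoidance for mid-size sets works by deriving store offsets from the count, so four possibly overlapping 16-byte stores cover every length from 16 to 63. A hedged C model of the new L(set_16..63) path (the helper name and the memcpy-as-16-byte-store idiom are mine, not glibc's):

```c
#include <string.h>
#include <stddef.h>

/* Set count bytes, 16 <= count <= 63, with no branches on count.
   off is 16 exactly when count >= 32 (the off = 16 & (count >> 1)
   trick from the patch), so the four stores always cover [0, count)
   even though they may overlap. */
static void
set16to63 (unsigned char *dst, unsigned char c, size_t count)
{
  unsigned char v[16];
  memset (v, c, sizeof v);
  size_t off = 16 & (count >> 1);
  memcpy (dst, v, 16);
  memcpy (dst + off, v, 16);
  memcpy (dst + count - off - 16, v, 16);
  memcpy (dst + count - 16, v, 16);
}
```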
---
sysdeps/aarch64/memset.S | 195 +++++++++++++++++----------------------
1 file changed, 84 insertions(+), 111 deletions(-)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index bbfb7184c3..caafb019e2 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "memset-reg.h"
#ifndef MEMSET
# define MEMSET memset
@@ -25,130 +25,132 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
-ENTRY (MEMSET)
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define off x3
+#define dstend2 x5
+ENTRY (MEMSET)
PTR_ARG (0)
SIZE_ARG (2)
dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_small)
+
add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
- cmp count, 96
- b.hi L(set_long)
- cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+ .p2align 4
/* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
- ret
- nop
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
+ str q0, [dst, 16]
+ tst valw, 255
+ b.ne L(no_zva)
+#ifndef ZVA64_ONLY
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+#endif
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
ret
-L(try_zva):
-#ifndef ZVA64_ONLY
.p2align 3
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
- nop
-#endif
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
- .p2align 4
-L(zva_64):
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- bic dst, dst, 63
stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
add dst, dst, 64
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
#ifndef ZVA64_ONLY
- .p2align 3
+ .p2align 4
L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
+ cmp zva_val, 5 /* ZVA size is 128 bytes. */
+ b.ne L(no_zva)
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
bic dst, dst, 127
sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
+ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
+1: add dst, dst, 128
+ dc zva, dst
subs count, count, 128
b.hi 1b
stp q0, q0, [dstend, -128]
@@ -156,35 +158,6 @@ L(zva_128):
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop. */
- b L(tail64)
#endif
END (MEMSET)
--
2.27.0


@@ -0,0 +1,65 @@
From 5fe151d86a19bc3dc791fd2d92efeb6c6e11cf64 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon, 25 Nov 2024 18:43:08 +0000
Subject: [PATCH] AArch64: Remove zva_128 from memset
Remove ZVA 128 support from memset - the new memset no longer
guarantees count >= 256, which can result in underflow and a
crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA
size of 128 and its memcpy implementation was removed in commit
e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special
case too.
[1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html
Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com>
(cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d)
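The underflow is easy to reproduce in C: L(zva_128) biased the count by 128 + 128 = 256, but the rewritten L(set_long) only guarantees count > 128, so the biased value can wrap and send the DC ZVA loop far past the buffer. A hedged illustration (the count value is made up):

```c
#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t count = 200;            /* reachable in the new L(set_long) */
  uint64_t biased = count - 256;   /* wraps: the DC ZVA loop overruns  */
  printf ("biased count = %#" PRIx64 "\n", biased);  /* 0xffffffffffffffc8 */
  return 0;
}
```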
---
sysdeps/aarch64/memset.S | 25 +------------------------
1 file changed, 1 insertion(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index caafb019e2..71814d0b2f 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -104,7 +104,7 @@ L(set_long):
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
+ b.ne L(no_zva)
#endif
stp q0, q0, [dst, 32]
bic dst, dstin, 63
@@ -137,28 +137,5 @@ L(no_zva_loop):
stp q0, q0, [dstend, -32]
ret
-#ifndef ZVA64_ONLY
- .p2align 4
-L(zva_128):
- cmp zva_val, 5 /* ZVA size is 128 bytes. */
- b.ne L(no_zva)
-
- stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128 + 128 /* Adjust count and bias for loop. */
-1: add dst, dst, 128
- dc zva, dst
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-#endif
-
END (MEMSET)
libc_hidden_builtin_def (MEMSET)
--
2.27.0


@@ -0,0 +1,29 @@
From 097299ffa904b327fce83770fa6a522e4393ddb3 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu, 27 Feb 2025 16:28:52 +0000
Subject: [PATCH] AArch64: Use prefer_sve_ifuncs for SVE memset
Use prefer_sve_ifuncs for SVE memset just like memcpy.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4)
---
sysdeps/aarch64/multiarch/memset.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 89fde57f42..ce5d35a20e 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -49,7 +49,7 @@ select_memset_ifunc (void)
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
- if (zva_size == 64)
+ if (prefer_sve_ifuncs && zva_size == 64)
return __memset_sve_zva64;
}
--
2.27.0


@@ -0,0 +1,132 @@
From f984e2d7e8299726891a1a497a3c36cd5542a0bf Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri, 31 Jan 2025 12:16:30 -0500
Subject: [PATCH] assert: Add test for CVE-2025-0395
Use the __progname symbol to override the program name to induce the
failure that CVE-2025-0395 describes.
This is related to BZ #32582
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
---
assert/Makefile | 1 +
assert/tst-assert-sa-2025-0001.c | 92 ++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+)
create mode 100644 assert/tst-assert-sa-2025-0001.c
diff --git a/assert/Makefile b/assert/Makefile
index 67f4e6a570..b0fc9fc4d2 100644
--- a/assert/Makefile
+++ b/assert/Makefile
@@ -38,6 +38,7 @@ tests := \
test-assert-perr \
tst-assert-c++ \
tst-assert-g++ \
+ tst-assert-sa-2025-0001 \
# tests
ifeq ($(have-cxx-thread_local),yes)
diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
new file mode 100644
index 0000000000..102cb0078d
--- /dev/null
+++ b/assert/tst-assert-sa-2025-0001.c
@@ -0,0 +1,92 @@
+/* Test for CVE-2025-0395.
+ Copyright The GNU Toolchain Authors.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Test that a large enough __progname does not result in a buffer overflow
+ when printing an assertion failure. This was CVE-2025-0395. */
+#include <assert.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xstdio.h>
+#include <support/xunistd.h>
+
+extern const char *__progname;
+
+int
+do_test (int argc, char **argv)
+{
+
+ support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
+ ignore_stderr ();
+
+ /* XXX assumes that the assert is on a 2 digit line number. */
+ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
+
+ int ret = fprintf (stderr, prompt, __FILE__);
+ if (ret < 0)
+ FAIL_EXIT1 ("fprintf failed: %m\n");
+
+ size_t pagesize = getpagesize ();
+ size_t namesize = pagesize - 1 - ret;
+
+ /* Alter the progname so that the assert message fills the entire page. */
+ char progname[namesize];
+ memset (progname, 'A', namesize - 1);
+ progname[namesize - 1] = '\0';
+ __progname = progname;
+
+ FILE *f = xfopen ("/proc/self/maps", "r");
+ char *line = NULL;
+ size_t len = 0;
+ uintptr_t prev_to = 0;
+
+ /* Pad the beginning of every writable mapping with a PROT_NONE map. This
+ ensures that the mmap in the assert_fail path never ends up below a
+ writable map and will terminate immediately in case of a buffer
+ overflow. */
+ while (xgetline (&line, &len, f))
+ {
+ uintptr_t from, to;
+ char perm[4];
+
+ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ",
+ &from, &to,
+ &perm[0], &perm[1], &perm[2], &perm[3]);
+
+ bool writable = (memchr (perm, 'w', 4) != NULL);
+
+ if (prev_to != 0 && from - prev_to > pagesize && writable)
+ xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+
+ prev_to = to;
+ }
+
+ xfclose (f);
+
+ assert (argc < 1);
+ return 0;
+}
+
+#define EXPECTED_SIGNAL SIGABRT
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
--
2.27.0


@@ -0,0 +1,210 @@
From 17f7ca193d60fefd6cc5e48aacd1ce9f7dd29862 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 14 Aug 2024 14:37:31 +0800
Subject: [PATCH 09/10] x86: Add `Avoid_STOSB` tunable to allow NT memset
without ERMS
The goal of this flag is to allow targets which don't prefer/have ERMS
to still access the non-temporal memset implementation.
There are 4 cases for tuning memset:
1) `Avoid_STOSB && Avoid_Non_Temporal_Memset`
- Memset with temporal stores
2) `Avoid_STOSB && !Avoid_Non_Temporal_Memset`
- Memset with temporal/non-temporal stores. Non-temporal path
goes through `rep stosb` path. We accomplish this by setting
`x86_rep_stosb_threshold` to
`x86_memset_non_temporal_threshold`.
3) `!Avoid_STOSB && Avoid_Non_Temporal_Memset`
- Memset with temporal stores/`rep stosb`
4) `!Avoid_STOSB && !Avoid_Non_Temporal_Memset`
- Memset with temporal stores/`rep stosb`/non-temporal stores.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
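A hedged model of the four cases (the function and parameter names are illustrative, not glibc's): since the non-temporal path is only reachable through the stosb branch, a no-ERMS target gets at it by aliasing the two thresholds, which also guarantees `rep stosb` itself is never chosen.

```c
#include <stdbool.h>
#include <stddef.h>

/* Pick memset thresholds for the four Avoid_STOSB /
   Avoid_Non_Temporal_Memset combinations described above. */
static void
pick_memset_thresholds (bool avoid_stosb, bool avoid_nt_memset,
                        size_t nt_threshold, size_t erms_default,
                        size_t *rep_stosb_thresh, size_t *nt_thresh)
{
  *nt_thresh = avoid_nt_memset ? (size_t) -1 : nt_threshold;
  /* With Avoid_STOSB, the stosb branch fires exactly when the
     non-temporal path applies (or never, if that is disabled too). */
  *rep_stosb_thresh = avoid_stosb ? *nt_thresh : erms_default;
}
```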
---
sysdeps/x86/cpu-features.c | 4 +++
sysdeps/x86/cpu-tunables.c | 2 ++
sysdeps/x86/dl-cacheinfo.h | 34 ++++++++++++++++---
...cpu-features-preferred_feature_index_1.def | 1 +
sysdeps/x86/tst-hwcap-tunables.c | 6 ++--
sysdeps/x86_64/multiarch/ifunc-memset.h | 18 +++++++---
6 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c9f2297524..287edc5b08 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -1014,6 +1014,10 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
cpu_features->preferred[index_arch_I686] |= bit_arch_I686;
+ /* No ERMS, we want to avoid stosb for memset. */
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB;
+
#if !HAS_CPUID
no_cpuid:
#endif
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index b8475730ea..a4bbf13080 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -214,6 +214,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Prefer_FSRM,
disable, 11);
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, Avoid_STOSB,
+ disable, 11);
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH (n, cpu_features,
Slow_SSE4_2,
SSE4_2,
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index d8288f0b0c..5803bfcea8 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1096,18 +1096,42 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
long int, NULL);
+ /*
+ For memset, the non-temporal implementation is only accessed through the
+ stosb code. ie:
+ ```
+ if (size >= rep_stosb_thresh)
+ {
+ if (size >= non_temporal_thresh)
+ {
+ do_non_temporal ();
+ }
+ do_stosb ();
+ }
+ do_normal_vec_loop ();
+ ```
+ So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
+ to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
+ `rep stosb` will never be used.
+ */
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+ memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, SIZE_MAX);
+ /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the
+ final value of `x86_memset_non_temporal_threshold`. In some cases this can
+ be a matter of correctness. */
+ if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB))
+ rep_stosb_threshold
+ = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
+ SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
- TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
- memset_non_temporal_threshold,
- minimum_non_temporal_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
- TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
- SIZE_MAX);
unsigned long int rep_movsb_stop_threshold;
/* Setting the upper bound of ERMS to the computed value of
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index aae1c85551..38a0c9226c 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
BIT (Avoid_Non_Temporal_Memset)
+BIT (Avoid_STOSB)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index 94307283d7..1920f5057e 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -60,7 +60,8 @@ static const struct test_t
/* Disable everything. */
"-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
- "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
+ "-Avoid_STOSB",
test_1,
array_length (test_1)
},
@@ -68,7 +69,8 @@ static const struct test_t
/* Same as before, but with some empty suboptions. */
",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
- "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
+ "-Avoid_STOSB,-,",
test_1,
array_length (test_1)
}
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 5c5096ec5a..6b3b9a17a2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
+static inline int
+prefer_erms_nt_impl (const struct cpu_features *cpu_features)
+{
+ return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
+}
+
static inline void *
IFUNC_SELECTOR (void)
{
@@ -61,7 +68,7 @@ IFUNC_SELECTOR (void)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx512_unaligned_erms);
return OPTIMIZE (avx512_unaligned);
@@ -76,7 +83,7 @@ IFUNC_SELECTOR (void)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (evex_unaligned_erms);
return OPTIMIZE (evex_unaligned);
@@ -84,7 +91,7 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx2_unaligned_erms_rtm);
return OPTIMIZE (avx2_unaligned_rtm);
@@ -93,14 +100,15 @@ IFUNC_SELECTOR (void)
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
Prefer_No_VZEROUPPER, !))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (prefer_erms_nt_impl (cpu_features))
return OPTIMIZE (avx2_unaligned_erms);
return OPTIMIZE (avx2_unaligned);
}
}
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
return OPTIMIZE (sse2_unaligned_erms);
return OPTIMIZE (sse2_unaligned);
--
2.17.1


@@ -0,0 +1,95 @@
From 01b5cac929a3be361dd575bed6673c40a25a6d61 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 14 Aug 2024 14:37:30 +0800
Subject: [PATCH 08/10] x86: Use `Avoid_Non_Temporal_Memset` to control
non-temporal path
This is just a refactor and there should be no behavioral change from
this commit.
The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
for controlling whether we use non-temporal memset rather than having
extra logic based on vendor.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
2 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index b4030776a7..c9f2297524 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -640,6 +640,12 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
+ /* Default is to avoid non-temporal memset on non-Intel/AMD hardware. This is
+ because, as of this writing, we only have benchmarks indicating its
+ profitability on Intel/AMD. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+
cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
@@ -665,6 +671,11 @@ init_cpu_features (struct cpu_features *cpu_features)
update_active (cpu_features);
+ /* Benchmarks indicate non-temporal memset can be profitable on Intel
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (family == 0x06)
{
model += extended_model;
@@ -874,6 +885,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
+ /* Benchmarks indicate non-temporal memset can be profitable on AMD
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
{
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index de4584116f..d8288f0b0c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,14 +1048,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores are more performant on Intel and AMD hardware above
- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
- && (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd))
- memset_non_temporal_threshold = non_temporal_threshold;
-
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1077,6 +1069,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (tunable_size != 0)
shared = tunable_size;
+ /* Non-temporal stores are more performant on some hardware above
+ non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
+ Intel and AMD hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
if (tunable_size > minimum_non_temporal_threshold
&& tunable_size <= maximum_non_temporal_threshold)
--
2.17.1


@@ -0,0 +1,192 @@
From aa3d7bd5299b33bffc118aa618b59bfa66059bcb Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Thu, 13 Feb 2025 21:56:52 +0100
Subject: [PATCH] elf: Keep using minimal malloc after early DTV resize (bug
32412)
If an auditor loads many TLS-using modules during startup, it is
possible to trigger DTV resizing. Previously, the DTV was marked
as allocated by the main malloc afterwards, even if the minimal
malloc was still in use. With this change, _dl_resize_dtv marks
the resized DTV as allocated with the minimal malloc.
The new test reuses TLS-using modules from other auditing tests.
Reviewed-by: DJ Delorie <dj@redhat.com>
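A self-contained analogy for the fix (not glibc's code; names are made up): memory handed out by an early bootstrap allocator must never reach the real free(), so a resize performed during bootstrap has to re-record the replacement block as the protected "initial" one.

```c
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

static void *initial_block;   /* block the teardown path must not free */

/* Grow a block; malloc() stands in for whichever allocator is live.
   The fix mirrored here: while still bootstrapping, mark the new block
   as initial so release_block skips it later. */
static void *
resize_block (void *old, size_t old_size, size_t new_size, bool bootstrapping)
{
  void *newp = malloc (new_size);
  if (newp == NULL)
    abort ();
  memcpy (newp, old, old_size);
  if (bootstrapping)
    initial_block = newp;
  return newp;
}

static void
release_block (void *block)
{
  if (block != initial_block)   /* bootstrap memory is never freed */
    free (block);
}
```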
---
elf/Makefile | 5 +++
elf/dl-tls.c | 7 ++++
elf/tst-audit-tlsdesc-dlopen2.c | 46 +++++++++++++++++++++++++
elf/tst-auditmod-tlsdesc2.c | 59 +++++++++++++++++++++++++++++++++
4 files changed, 117 insertions(+)
create mode 100644 elf/tst-audit-tlsdesc-dlopen2.c
create mode 100644 elf/tst-auditmod-tlsdesc2.c
diff --git a/elf/Makefile b/elf/Makefile
index 5c833871d0..1ea0e7037e 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -379,6 +379,7 @@ tests += \
tst-align3 \
tst-audit-tlsdesc \
tst-audit-tlsdesc-dlopen \
+ tst-audit-tlsdesc-dlopen2 \
tst-audit1 \
tst-audit2 \
tst-audit8 \
@@ -863,6 +864,7 @@ modules-names += \
tst-auditmanymod8 \
tst-auditmanymod9 \
tst-auditmod-tlsdesc \
+ tst-auditmod-tlsdesc2 \
tst-auditmod1 \
tst-auditmod11 \
tst-auditmod12 \
@@ -3189,6 +3191,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
+$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \
+ $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules))
+tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so
$(objpfx)tst-dlmopen-twice.out: \
$(objpfx)tst-dlmopen-twice-mod1.so \
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 5178d9b66a..a083a82933 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -566,6 +566,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
if (newp == NULL)
oom ();
memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t));
+#ifdef SHARED
+ /* Auditors can trigger a DTV resize event while the full malloc
+ is not yet in use. Mark the new DTV allocation as the
+ initial allocation. */
+ if (!__rtld_malloc_is_complete ())
+ GL(dl_initial_dtv) = &newp[1];
+#endif
}
else
{
diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c
new file mode 100644
index 0000000000..7ba2c4129a
--- /dev/null
+++ b/elf/tst-audit-tlsdesc-dlopen2.c
@@ -0,0 +1,46 @@
+/* Loading TLS-using modules from auditors (bug 32412). Main program.
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/xdlfcn.h>
+#include <stdio.h>
+
+static int
+do_test (void)
+{
+ puts ("info: start of main program");
+
+ /* Load TLS-using modules, to trigger DTV resizing. The dynamic
+ linker will load them again (requiring their own TLS) because the
+ dlopen calls from the auditor were in the auditing namespace. */
+ for (int i = 1; i <= 19; ++i)
+ {
+ char dso[30];
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
+ char sym[30];
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
+
+ void *handle = xdlopen (dso, RTLD_LAZY);
+ int (*func) (void) = xdlsym (handle, sym);
+ /* Trigger TLS allocation. */
+ func ();
+ }
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c
new file mode 100644
index 0000000000..50275cd34d
--- /dev/null
+++ b/elf/tst-auditmod-tlsdesc2.c
@@ -0,0 +1,59 @@
+/* Loading TLS-using modules from auditors (bug 32412). Audit module.
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <link.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+
+unsigned int
+la_version (unsigned int version)
+{
+ /* Open some modules, to trigger DTV resizing before the switch to
+ the main malloc. */
+ for (int i = 1; i <= 19; ++i)
+ {
+ char dso[30];
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
+ char sym[30];
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
+
+ void *handle = dlopen (dso, RTLD_LAZY);
+ if (handle == NULL)
+ {
+ printf ("error: dlmopen from auditor: %s\n", dlerror ());
+ fflush (stdout);
+ _exit (1);
+ }
+ int (*func) (void) = dlsym (handle, sym);
+ if (func == NULL)
+ {
+ printf ("error: dlsym from auditor: %s\n", dlerror ());
+ fflush (stdout);
+ _exit (1);
+ }
+ /* Trigger TLS allocation. */
+ func ();
+ }
+
+ puts ("info: TLS-using modules loaded from auditor");
+ fflush (stdout);
+
+ return LAV_CURRENT;
+}
--
2.27.0


@@ -0,0 +1,97 @@
From daa15a5bffc436cf7b943b306c85c90ce8bb369e Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:54 +0800
Subject: [PATCH 02/10] x86: Add cache information support for Hygon processors
Add hygon branch in dl_init_cacheinfo function to initialize
cache size variables for hygon processors. In the meanwhile,
add handle_hygon() function to get cache information.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
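The values handle_hygon() computes flow into the cpu_features cache-geometry fields set at the end of this hunk, which also back sysconf. A quick, hedged way to sanity-check the result on a Hygon machine (compiles on any Linux glibc; prints -1 where a level is unknown):

```c
#include <stdio.h>
#include <unistd.h>

int
main (void)
{
  printf ("L1d: %ld bytes\n", sysconf (_SC_LEVEL1_DCACHE_SIZE));
  printf ("L2:  %ld bytes\n", sysconf (_SC_LEVEL2_CACHE_SIZE));
  printf ("L3:  %ld bytes\n", sysconf (_SC_LEVEL3_CACHE_SIZE));
  return 0;
}
```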
---
sysdeps/x86/dl-cacheinfo.h | 60 ++++++++++++++++++++++++++++++++++++++
1 file changed, 60 insertions(+)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 7b5ed210ca..85c404dd26 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -567,6 +567,48 @@ handle_zhaoxin (int name)
return 0;
}
+static long int __attribute__ ((noinline))
+handle_hygon (int name)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ unsigned int count = 0x1;
+
+ if (name >= _SC_LEVEL3_CACHE_SIZE)
+ count = 0x3;
+ else if (name >= _SC_LEVEL2_CACHE_SIZE)
+ count = 0x2;
+ else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+ count = 0x0;
+
+ /* Use __cpuid__ '0x8000_001D' to compute cache details. */
+ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+
+ switch (name)
+ {
+ case _SC_LEVEL1_ICACHE_ASSOC:
+ case _SC_LEVEL1_DCACHE_ASSOC:
+ case _SC_LEVEL2_CACHE_ASSOC:
+ case _SC_LEVEL3_CACHE_ASSOC:
+ return ((ebx >> 22) & 0x3ff) + 1;
+ case _SC_LEVEL1_ICACHE_LINESIZE:
+ case _SC_LEVEL1_DCACHE_LINESIZE:
+ case _SC_LEVEL2_CACHE_LINESIZE:
+ case _SC_LEVEL3_CACHE_LINESIZE:
+ return (ebx & 0xfff) + 1;
+ case _SC_LEVEL1_ICACHE_SIZE:
+ case _SC_LEVEL1_DCACHE_SIZE:
+ case _SC_LEVEL2_CACHE_SIZE:
+ case _SC_LEVEL3_CACHE_SIZE:
+ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
+ default:
+ __builtin_unreachable ();
+ }
+ return -1;
+}
+
static void
get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
long int core)
@@ -890,6 +932,24 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
shared_per_thread = shared;
}
+ else if (cpu_features->basic.kind == arch_kind_hygon)
+ {
+ data = handle_hygon (_SC_LEVEL1_DCACHE_SIZE);
+ shared = handle_hygon (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
+
+ level1_icache_size = handle_hygon (_SC_LEVEL1_ICACHE_SIZE);
+ level1_icache_linesize = handle_hygon (_SC_LEVEL1_ICACHE_LINESIZE);
+ level1_dcache_size = data;
+ level1_dcache_assoc = handle_hygon (_SC_LEVEL1_DCACHE_ASSOC);
+ level1_dcache_linesize = handle_hygon (_SC_LEVEL1_DCACHE_LINESIZE);
+ level2_cache_size = handle_hygon (_SC_LEVEL2_CACHE_SIZE);
+ level2_cache_assoc = handle_hygon (_SC_LEVEL2_CACHE_ASSOC);
+ level2_cache_linesize = handle_hygon (_SC_LEVEL2_CACHE_LINESIZE);
+ level3_cache_size = shared;
+ level3_cache_assoc = handle_hygon (_SC_LEVEL3_CACHE_ASSOC);
+ level3_cache_linesize = handle_hygon (_SC_LEVEL3_CACHE_LINESIZE);
+ }
cpu_features->level1_icache_size = level1_icache_size;
cpu_features->level1_icache_linesize = level1_icache_linesize;
--
2.17.1


@@ -0,0 +1,69 @@
From 3215d6157f5f94706aa5db6783838885a8a3c4f1 Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:53 +0800
Subject: [PATCH 01/10] x86: Add new architecture type for Hygon processors
Add a new architecture type, arch_kind_hygon, to split the Hygon branch
from AMD. This makes it easier for Hygon processors to use settings
suited to their own characteristics.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
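The magic numbers tested in the patch are just the CPUID leaf-0 vendor string packed into little-endian words (EBX, EDX, ECX order). A small demonstration, assuming a little-endian host:

```c
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int
main (void)
{
  /* The constants from the patch's "HygonGenuine" check. */
  uint32_t ebx = 0x6f677948, edx = 0x6e65476e, ecx = 0x656e6975;
  char vendor[13];
  memcpy (vendor, &ebx, 4);
  memcpy (vendor + 4, &edx, 4);
  memcpy (vendor + 8, &ecx, 4);
  vendor[12] = '\0';
  puts (vendor);   /* prints "HygonGenuine" */
  return 0;
}
```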
---
sysdeps/x86/cpu-features.c | 19 ++++++++++++++++---
sysdeps/x86/include/cpu-features.h | 1 +
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index f752ebd24d..c4dd85145e 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -851,9 +851,8 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
|= bit_arch_Avoid_Short_Distance_REP_MOVSB;
}
- /* This spells out "AuthenticAMD" or "HygonGenuine". */
- else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
- || (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
+ /* This spells out "AuthenticAMD". */
+ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
{
unsigned int extended_model;
@@ -963,6 +962,20 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
}
}
}
+ /* This spells out "HygonGenuine". */
+ else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
+ {
+ unsigned int extended_model;
+
+ kind = arch_kind_hygon;
+
+ get_common_indices (cpu_features, &family, &model, &extended_model,
+ &stepping);
+
+ get_extended_indices (cpu_features);
+
+ update_active (cpu_features);
+ }
else
{
kind = arch_kind_other;
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index eb30d342a6..594feeb2f4 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -856,6 +856,7 @@ enum cpu_features_kind
arch_kind_intel,
arch_kind_amd,
arch_kind_zhaoxin,
+ arch_kind_hygon,
arch_kind_other
};
--
2.17.1


@@ -0,0 +1,211 @@
From 4ad2c9d04b76d7c4a42d80a82c022cd60b43b8b2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 24 May 2024 12:38:51 -0500
Subject: [PATCH 04/10] x86: Add separate non-temporal tunable for memset
The tuning for non-temporal stores for memset vs memcpy is not always
the same. This includes both the exact value and whether non-temporal
stores are profitable at all for a given arch.
This patch adds `x86_memset_non_temporal_threshold`. Currently we
disable non-temporal stores for non-Intel vendors, as the only
benchmarks showing a benefit have been on Intel hardware.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
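A hedged sketch of the size dispatch the new tunable controls, following the updated comment in memset-vec-unaligned-erms.S (the enum and function are illustrative, not glibc's):

```c
#include <stddef.h>

enum memset_path { VEC_LOOP, REP_STOSB, NON_TEMPORAL };

/* Sizes in [rep_stosb_threshold, memset_non_temporal_threshold) use
   REP STOSB on ERMS machines; sizes at or above the new tunable use
   non-temporal stores; everything smaller stays on the vector loop. */
static enum memset_path
classify (size_t n, size_t rep_stosb_threshold,
          size_t memset_non_temporal_threshold)
{
  if (n >= memset_non_temporal_threshold)
    return NON_TEMPORAL;
  if (n >= rep_stosb_threshold)
    return REP_STOSB;
  return VEC_LOOP;
}
```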
---
manual/tunables.texi | 16 +++++++++++++++-
sysdeps/x86/cacheinfo.h | 8 +++++++-
sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
sysdeps/x86/dl-tunables.list | 3 +++
sysdeps/x86/include/cpu-features.h | 4 +++-
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
7 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 6493904bae..2a2877884c 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.cpu.x86_shstk:
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
@@ -486,7 +487,8 @@ thread stack originally backup by Huge Pages to default pages.
@cindex shared_cache_size tunables
@cindex tunables, shared_cache_size
@cindex non_temporal_threshold tunables
-@cindex tunables, non_temporal_threshold
+@cindex memset_non_temporal_threshold tunables
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
@deftp {Tunable namespace} glibc.cpu
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
@@ -562,6 +564,18 @@ like memmove and memcpy.
This tunable is specific to i386 and x86-64.
@end deftp
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
+the user to set threshold in bytes for non temporal store in
+memset. Non temporal stores give a hint to the hardware to move data
+directly to memory without displacing other data from the cache. This
+tunable is used by some platforms to determine when to use non
+temporal stores memset.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
+
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
set threshold in bytes to start using "rep movsb". The value must be
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index ec1bc142c4..fd2b2ae66b 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
-/* Threshold to use non temporal store. */
+/* Threshold to use non temporal store in memmove. */
long int __x86_shared_non_temporal_threshold attribute_hidden;
+/* Threshold to use non temporal store in memset. */
+long int __x86_memset_non_temporal_threshold attribute_hidden;
+
/* Threshold to use Enhanced REP MOVSB. */
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
@@ -77,6 +80,9 @@ init_cacheinfo (void)
__x86_shared_non_temporal_threshold
= cpu_features->non_temporal_threshold;
+ __x86_memset_non_temporal_threshold
+ = cpu_features->memset_non_temporal_threshold;
+
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index ce2e6927e4..9f27da21ce 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,6 +1048,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* Non-temporal stores in memset have only been tested on Intel hardware.
+ Until we benchmark data on other x86 processor, disable non-temporal
+ stores in memset. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (cpu_features->basic.kind == arch_kind_intel)
+ memset_non_temporal_threshold = non_temporal_threshold;
+
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1074,6 +1081,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
&& tunable_size <= maximum_non_temporal_threshold)
non_temporal_threshold = tunable_size;
+ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ if (tunable_size > minimum_non_temporal_threshold
+ && tunable_size <= maximum_non_temporal_threshold)
+ memset_non_temporal_threshold = tunable_size;
+
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
if (tunable_size > minimum_rep_movsb_threshold)
rep_movsb_threshold = tunable_size;
@@ -1089,6 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (
+ x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, maximum_non_temporal_threshold);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
@@ -1102,6 +1117,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
+ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 5aab63e532..05d54b5eba 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -83,6 +83,8 @@ _dl_diagnostics_cpu (void)
cpu_features->shared_cache_size);
print_cpu_features_value ("non_temporal_threshold",
cpu_features->non_temporal_threshold);
+ print_cpu_features_value ("memset_non_temporal_threshold",
+ cpu_features->memset_non_temporal_threshold);
print_cpu_features_value ("rep_movsb_threshold",
cpu_features->rep_movsb_threshold);
print_cpu_features_value ("rep_movsb_stop_threshold",
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index d1442d88ba..53852d6a07 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -30,6 +30,9 @@ glibc {
x86_non_temporal_threshold {
type: SIZE_T
}
+ x86_memset_non_temporal_threshold {
+ type: SIZE_T
+ }
x86_rep_movsb_threshold {
type: SIZE_T
# Since there is overhead to set up REP MOVSB operation, REP
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 594feeb2f4..e2d641dcd0 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -918,8 +918,10 @@ struct cpu_features
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
unsigned long int shared_cache_size;
- /* Threshold to use non temporal store. */
+ /* Threshold to use non temporal store in memmove. */
unsigned long int non_temporal_threshold;
+ /* Threshold to use non temporal store in memset. */
+ unsigned long int memset_non_temporal_threshold;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
/* Threshold to stop using "rep movsb". */
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index aba45e3da0..d95750b516 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -24,9 +24,9 @@
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done.
6. On machines ERMS feature, if size is range
- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
then REP STOSB will be used.
- 7. If size >= __x86_shared_non_temporal_threshold, use a
+ 7. If size >= __x86_memset_non_temporal_threshold, use a
non-temporal stores. */
#include <sysdep.h>
@@ -318,7 +318,7 @@ L(return_vzeroupper):
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP
--
2.17.1
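
The hunks above wire up the new x86_memset_non_temporal_threshold tunable. For
context, here is a minimal C sketch of the technique it gates -- threshold-
dispatched non-temporal stores -- assuming SSE2; memset_nt and nt_threshold are
illustrative names, not glibc internals:

#include <emmintrin.h>   /* SSE2: _mm_set1_epi8, _mm_stream_si128, _mm_sfence */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical threshold; glibc derives the real value from cache sizes
   and exposes it as glibc.cpu.x86_memset_non_temporal_threshold.  */
static size_t nt_threshold = 8 * 1024 * 1024;

void *
memset_nt (void *dst, int c, size_t n)
{
  /* Small or misaligned buffers take the normal (cached) path.  */
  if (n < nt_threshold || ((uintptr_t) dst & 15) != 0)
    return memset (dst, c, n);

  __m128i v = _mm_set1_epi8 ((char) c);
  char *p = dst;
  for (size_t i = 0; i < n / 16; i++, p += 16)
    _mm_stream_si128 ((__m128i *) p, v);   /* store, bypassing the cache */
  _mm_sfence ();                           /* order the NT stores */
  if (n % 16)
    memset (p, c, n % 16);                 /* cached tail */
  return dst;
}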


@ -0,0 +1,263 @@
From ce7c6c491ed0750a10f9a52b5edc710d978e70e2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 15 Jul 2024 16:19:17 +0800
Subject: [PATCH 07/10] x86: Disable non-temporal memset on Skylake Server
The original commit enabling non-temporal memset on Skylake Server had
erroneous benchmarks (actually done on ICX).
Further benchmarks indicate non-temporal stores may in fact be a
regression on Skylake Server.
This commit may be over-cautious in some cases, but should avoid any
regressions for 2.40.
Tested using qemu on all x86_64 CPU architectures supported by both
qemu and GLIBC.
Reviewed-by: DJ Delorie <dj@redhat.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 13 +-
sysdeps/x86/cpu-tunables.c | 6 +
sysdeps/x86/dl-cacheinfo.h | 15 +-
...cpu-features-preferred_feature_index_1.def | 1 +
sysdeps/x86/tst-hwcap-tunables.c | 148 ++++++++++++++++++
5 files changed, 173 insertions(+), 10 deletions(-)
create mode 100644 sysdeps/x86/tst-hwcap-tunables.c
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c4dd85145e..b4030776a7 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -754,11 +754,18 @@ init_cpu_features (struct cpu_features *cpu_features)
/* Newer Bigcore microarch (larger non-temporal store
threshold). */
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_COMETLAKE:
case INTEL_BIGCORE_SKYLAKE_AVX512:
case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due to its unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
case INTEL_BIGCORE_ICELAKE:
case INTEL_BIGCORE_TIGERLAKE:
case INTEL_BIGCORE_ROCKETLAKE:
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 0d4f328585..b8475730ea 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -272,6 +272,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
disable, 24);
}
break;
+ case 25:
+ {
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ Avoid_Non_Temporal_Memset,
+ disable, 25);
+ }
case 26:
{
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a76df092e6..de4584116f 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1051,13 +1051,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* Non-temporal stores are more performant on Intel and AMD hardware above
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd)
- memset_non_temporal_threshold = non_temporal_threshold;
-
- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
- cases slower than the vectorized path (and for some alignments,
- it is really slow, check BZ #30994). */
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+ && (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
if (cpu_features->basic.kind == arch_kind_amd)
rep_movsb_threshold = non_temporal_threshold;
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index d20c5b3196..aae1c85551 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Avoid_Non_Temporal_Memset)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
new file mode 100644
index 0000000000..94307283d7
--- /dev/null
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -0,0 +1,148 @@
+/* Tests for x86 GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
+ Copyright (C) 2023-2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <getopt.h>
+#include <ifunc-impl-list.h>
+#include <spawn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <intprops.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xunistd.h>
+#include <support/capture_subprocess.h>
+
+/* Nonzero if the program gets called via `exec'. */
+#define CMDLINE_OPTIONS \
+ { "restart", no_argument, &restart, 1 },
+static int restart;
+
+/* Disable everything. */
+static const char *test_1[] =
+{
+ "__memcpy_avx512_no_vzeroupper",
+ "__memcpy_avx512_unaligned",
+ "__memcpy_avx512_unaligned_erms",
+ "__memcpy_evex_unaligned",
+ "__memcpy_evex_unaligned_erms",
+ "__memcpy_avx_unaligned",
+ "__memcpy_avx_unaligned_erms",
+ "__memcpy_avx_unaligned_rtm",
+ "__memcpy_avx_unaligned_erms_rtm",
+ "__memcpy_ssse3",
+};
+
+static const struct test_t
+{
+ const char *env;
+ const char *const *funcs;
+ size_t nfuncs;
+} tests[] =
+{
+ {
+ /* Disable everything. */
+ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
+ test_1,
+ array_length (test_1)
+ },
+ {
+ /* Same as before, but with some empty suboptions. */
+ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
+ test_1,
+ array_length (test_1)
+ }
+};
+
+/* Called on process re-execution. */
+_Noreturn static void
+handle_restart (int ntest)
+{
+ struct libc_ifunc_impl impls[32];
+ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
+ if (cnt == 0)
+ _exit (EXIT_SUCCESS);
+ TEST_VERIFY_EXIT (cnt >= 1);
+ for (int i = 0; i < cnt; i++)
+ {
+ for (int f = 0; f < tests[ntest].nfuncs; f++)
+ {
+ if (strcmp (impls[i].name, tests[ntest].funcs[f]) == 0)
+ TEST_COMPARE (impls[i].usable, false);
+ }
+ }
+
+ _exit (EXIT_SUCCESS);
+}
+
+static int
+do_test (int argc, char *argv[])
+{
+ /* We must have either:
+ - One or four parameters left if called initially:
+ + path to ld.so optional
+ + "--library-path" optional
+ + the library path optional
+ + the application name
+ + the test to check */
+
+ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
+
+ if (restart)
+ handle_restart (atoi (argv[1]));
+
+ char nteststr[INT_BUFSIZE_BOUND (int)];
+
+ char *spargv[10];
+ {
+ int i = 0;
+ for (; i < argc - 1; i++)
+ spargv[i] = argv[i + 1];
+ spargv[i++] = (char *) "--direct";
+ spargv[i++] = (char *) "--restart";
+ spargv[i++] = nteststr;
+ spargv[i] = NULL;
+ }
+
+ for (int i = 0; i < array_length (tests); i++)
+ {
+ snprintf (nteststr, sizeof nteststr, "%d", i);
+
+ printf ("[%d] Spawned test for %s\n", i, tests[i].env);
+ char *tunable = xasprintf ("glibc.cpu.hwcaps=%s", tests[i].env);
+ setenv ("GLIBC_TUNABLES", tunable, 1);
+
+ struct support_capture_subprocess result
+ = support_capture_subprogram (spargv[0], spargv, NULL);
+ support_capture_subprocess_check (&result, "tst-tunables", 0,
+ sc_allow_stderr);
+ support_capture_subprocess_free (&result);
+
+ free (tunable);
+ }
+
+ return 0;
+}
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
--
2.17.1
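
A usage note on the patch above: Avoid_Non_Temporal_Memset is an ordinary
preferred-feature bit, so the Skylake Server default is overridable at run
time. As the included test demonstrates, listing -Avoid_Non_Temporal_Memset
in GLIBC_TUNABLES=glibc.cpu.hwcaps=... clears the bit and re-enables the
non-temporal memset path.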


@ -0,0 +1,92 @@
From 1e57e1c6aa6ca5a476aba725271c1ace9be345d3 Mon Sep 17 00:00:00 2001
From: Feifei Wang <wangfeifei@hygon.cn>
Date: Mon, 19 Aug 2024 14:57:55 +0800
Subject: [PATCH 10/10] x86: Enable non-temporal memset for Hygon processors
This patch uses 'Avoid_Non_Temporal_Memset' flag to access
the non-temporal memset implementation for hygon processors.
Test Results:
hygon1 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 0.994
4MB 0.996
8MB 0.670
16MB 0.343
32MB 0.355
hygon2 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 1
4MB 1
8MB 1.312
16MB 0.822
32MB 0.830
hygon3 arch
x86_memset_non_temporal_threshold = 8MB
size new performance time / old performance time
1MB 1
4MB 0.990
8MB 0.737
16MB 0.390
32MB 0.401
For hygon arch with this patch, non-temporal stores can improve
performance by 20% - 65%.
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 9 +++++++--
sysdeps/x86/dl-cacheinfo.h | 2 +-
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 287edc5b08..f5539aea6f 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -640,9 +640,9 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
- /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
+ /* Default is avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is,
as of writing this, we only have benchmarks indicating its profitability
- on Intel/AMD. */
+ on Intel/AMD/Hygon. */
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|= bit_arch_Avoid_Non_Temporal_Memset;
@@ -998,6 +998,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
get_extended_indices (cpu_features);
update_active (cpu_features);
+
+ /* Benchmarks indicate non-temporal memset can be profitable on Hygon
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
}
else
{
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 5803bfcea8..d4dad8df3b 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* Non-temporal stores are more performant on some hardware above
non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
- Intel and AMD hardware. */
+ Intel, AMD and Hygon hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
memset_non_temporal_threshold = non_temporal_threshold;
--
2.17.1
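
The enable/disable mechanics in this patch are plain bit operations on the
preferred-feature array. A self-contained sketch of the idiom (the bit value
here is made up for illustration):

#include <stdio.h>

/* Illustrative stand-in for glibc's bit_arch_* constant.  */
#define bit_arch_Avoid_Non_Temporal_Memset (1u << 0)

int
main (void)
{
  unsigned int preferred = 0;

  /* Default: assume non-temporal memset is unprofitable.  */
  preferred |= bit_arch_Avoid_Non_Temporal_Memset;

  /* Hygon path: benchmarks justify NT stores, so clear the bit.  */
  preferred &= ~bit_arch_Avoid_Non_Temporal_Memset;

  printf ("avoid NT memset: %u\n",
          !!(preferred & bit_arch_Avoid_Non_Temporal_Memset));   /* prints 0 */
  return 0;
}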


@ -0,0 +1,47 @@
From 54e99a96ec3b97f53ee018bfa8dbbef2dd13f1e8 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 7 Jun 2024 23:04:47 +0000
Subject: [PATCH 05/10] x86: Enable non-temporal memset tunable for AMD
In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for
memset") a tunable threshold for enabling non-temporal memset was added,
but only for Intel hardware.
Since that commit, new benchmark results suggest that non-temporal
memset is beneficial on AMD, as well, so allow this tunable to be set
for AMD.
See:
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
which has been updated to include data using different strategies for
large memset on AMD Zen2, Zen3, and Zen4.
Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 9f27da21ce..dfdb4069c7 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,11 +1048,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores in memset have only been tested on Intel hardware.
- Until we benchmark data on other x86 processor, disable non-temporal
- stores in memset. */
+ /* Non-temporal stores are more performant on Intel and AMD hardware above
+ non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel)
+ if (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd)
memset_non_temporal_threshold = non_temporal_threshold;
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
--
2.17.1


@ -0,0 +1,149 @@
From f1ea6401d790764e4fcf02c6fb28e69841c25640 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:38 -0300
Subject: [PATCH 03/10] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
The REP MOVSB usage on memcpy/memmove does not show much performance
improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
as from BZ 30994, if the source is aligned and the destination is not
the performance can be 20x slower.
The performance difference is noticeable with small buffer sizes, closer
to the lower bounds limits when memcpy/memmove starts to use ERMS. The
performance of REP MOVSB is similar to vectorized instruction on the
size limit (the L2 cache). Also, there is no drawback to multiple cores
sharing the cache.
Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
1 file changed, 18 insertions(+), 20 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 85c404dd26..ce2e6927e4 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -833,7 +833,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
long int data = -1;
long int shared = -1;
long int shared_per_thread = -1;
- long int core = -1;
unsigned int threads = 0;
unsigned long int level1_icache_size = -1;
unsigned long int level1_icache_linesize = -1;
@@ -851,7 +850,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (cpu_features->basic.kind == arch_kind_intel)
{
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
shared_per_thread = shared;
@@ -864,7 +862,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
level1_dcache_linesize
= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
- level2_cache_size = core;
+ level2_cache_size
+ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
level2_cache_assoc
= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
level2_cache_linesize
@@ -877,12 +876,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level4_cache_size
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
{
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
shared_per_thread = shared;
@@ -891,19 +890,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_amd)
{
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -911,7 +910,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
@@ -922,12 +921,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (shared <= 0)
{
/* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ shared = level2_cache_size;
}
else if (cpu_features->basic.family < 0x17)
{
/* Account for exclusive L2 and L3 caches. */
- shared += core;
+ shared += level2_cache_size;
}
shared_per_thread = shared;
@@ -1049,6 +1048,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_threshold = non_temporal_threshold;
+
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;
@@ -1090,16 +1095,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
SIZE_MAX);
unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
/* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
+ non-temporal threshold for all architectures. */
+ rep_movsb_stop_threshold = non_temporal_threshold;
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
--
2.17.1
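
The selection logic the patch adjusts boils down to a size-window test. A
hypothetical C sketch (x86-64 with GCC/Clang inline asm; the thresholds and
function names are illustrative, not glibc's dispatch code):

#include <stddef.h>
#include <string.h>

/* Illustrative values; glibc computes both from cache geometry.  */
static size_t rep_movsb_threshold = 2112;
static size_t rep_movsb_stop_threshold = 1u << 20;

static void *
copy_rep_movsb (void *d, const void *s, size_t n)
{
  void *ret = d;
  __asm__ volatile ("rep movsb"        /* ERMS fast path */
                    : "+D" (d), "+S" (s), "+c" (n)
                    :: "memory");
  return ret;
}

void *
copy_dispatch (void *d, const void *s, size_t n)
{
  /* On Zen3+ the patch sets rep_movsb_threshold to the non-temporal
     threshold, which closes this window and keeps the vector path.  */
  if (n >= rep_movsb_threshold && n < rep_movsb_stop_threshold)
    return copy_rep_movsb (d, s, n);
  return memcpy (d, s, n);             /* stand-in for the SIMD loop */
}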


@ -0,0 +1,41 @@
From 8a2cea0ae0cbd4120770b81f0be422f60f020e17 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 14 Jun 2024 13:01:58 -0500
Subject: [PATCH 06/10] x86: Fix value for `x86_memset_non_temporal_threshold`
when it is undesirable
When we don't want to use non-temporal stores for memset, we set
`x86_memset_non_temporal_threshold` to SIZE_MAX.
The current code, however, was using `maximum_non_temporal_threshold`
as the upper bound, which is `SIZE_MAX >> 4`, so we ended up with a
value of `0`.
The fix is to just use `SIZE_MAX` as the upper bound when setting the
tunable.
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index dfdb4069c7..a76df092e6 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1101,9 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
- TUNABLE_SET_WITH_BOUNDS (
- x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
- minimum_non_temporal_threshold, maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+ memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
--
2.17.1
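
To see why the value collapsed to 0, here is a hypothetical sketch of a bounds
check that rejects, rather than clamps, an out-of-range default; it mirrors the
symptom described above, not the real TUNABLE_SET_WITH_BOUNDS implementation:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical: the value is stored only when inside [min, max],
   otherwise it is left at its zero-initialized state.  */
static size_t
set_with_bounds (size_t val, size_t min, size_t max)
{
  return (val >= min && val <= max) ? val : 0;
}

int
main (void)
{
  /* SIZE_MAX (meaning "never use NT memset") fails the SIZE_MAX >> 4
     upper bound, so the threshold becomes 0 and every memset would
     take the non-temporal path instead of none.  */
  size_t t = set_with_bounds (SIZE_MAX, 0x4040, SIZE_MAX >> 4);
  printf ("threshold = %zu\n", t);   /* prints 0 */
  return 0;
}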


@ -0,0 +1,66 @@
From fe1ffef2eec9c6634a1e9af951eb68f0f5614470 Mon Sep 17 00:00:00 2001
From: xujing <xujing99@huawei.com>
Date: Thu, 2 Dec 2021 11:41:46 +0800
Subject: [PATCH] glibc: fix CVE-2019-1010023
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
| PT_LOAD
|
| […] Loadable segment entries in the program header table appear in
| ascending order, sorted on the p_vaddr member.
http://www.sco.com/developers/gabi/latest/ch5.pheader.html
A check is needed to fix the vulnerability in load-command mapping reported at
https://sourceware.org/bugzilla/show_bug.cgi?id=22851
Signed-off-by: lvying <lvying6@huawei.com>
Signed-off-by: xujing <xujing99@huawei.com>
---
elf/dl-map-segments.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h
index 084076a2..a41ae73b 100644
--- a/elf/dl-map-segments.h
+++ b/elf/dl-map-segments.h
@@ -33,6 +33,7 @@ _dl_map_segments (struct link_map *l, int fd,
struct link_map *loader)
{
const struct loadcmd *c = loadcmds;
+ ElfW(Addr) l_map_end_aligned;
if (__glibc_likely (type == ET_DYN))
{
@@ -61,6 +62,8 @@ _dl_map_segments (struct link_map *l, int fd,
return DL_MAP_SEGMENTS_ERROR_MAP_SEGMENT;
l->l_map_end = l->l_map_start + maplength;
+ l_map_end_aligned = ((l->l_map_end + GLRO(dl_pagesize) - 1)
+ & ~(GLRO(dl_pagesize) - 1));
l->l_addr = l->l_map_start - c->mapstart;
if (has_holes)
@@ -85,10 +88,16 @@ _dl_map_segments (struct link_map *l, int fd,
/* Remember which part of the address space this object uses. */
l->l_map_start = c->mapstart + l->l_addr;
l->l_map_end = l->l_map_start + maplength;
+ l_map_end_aligned = ((l->l_map_end + GLRO(dl_pagesize) - 1)
+ & ~(GLRO(dl_pagesize) - 1));
l->l_contiguous = !has_holes;
while (c < &loadcmds[nloadcmds])
{
+ if ((l->l_addr + c->mapend) > l_map_end_aligned ||
+ (l->l_addr + c->mapstart) < l->l_map_start)
+ return DL_MAP_SEGMENTS_ERROR_MAP_SEGMENT;
+
if (c->mapend > c->mapstart
/* Map the segment contents from the file. */
&& (__mmap ((void *) (l->l_addr + c->mapstart),
--
2.23.0
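
The essence of the added check, extracted into a standalone hypothetical helper
(field names mirror the link map, but this is a sketch, not the loader code):

#include <stdint.h>

/* Round an address up to a page boundary, as the patch does for
   l_map_end.  */
static uintptr_t
page_align_up (uintptr_t addr, uintptr_t pagesize)
{
  return (addr + pagesize - 1) & ~(pagesize - 1);
}

/* Reject a PT_LOAD command whose mapping would fall outside the range
   reserved for the object (cf. BZ 22851, where unsorted or overlapping
   segments could be mapped elsewhere).  */
static int
segment_in_bounds (uintptr_t l_addr, uintptr_t mapstart, uintptr_t mapend,
                   uintptr_t l_map_start, uintptr_t l_map_end,
                   uintptr_t pagesize)
{
  uintptr_t l_map_end_aligned = page_align_up (l_map_end, pagesize);
  return (l_addr + mapend) <= l_map_end_aligned
         && (l_addr + mapstart) >= l_map_start;
}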


@ -67,7 +67,7 @@
##############################################################################
Name: glibc
Version: 2.38
Release: 52
Release: 57
Summary: The GNU libc libraries
License: %{all_license}
URL: http://www.gnu.org/software/glibc/
@ -267,6 +267,25 @@ Patch177: elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
Patch181: backport-elf-Keep-using-minimal-malloc-after-early-DTV-resize.patch
Patch182: backport-x86-Add-new-architecture-type-for-Hygon-processors.patch
Patch183: backport-x86-Add-cache-information-support-for-Hygon-processo.patch
Patch184: backport-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
Patch185: backport-x86-Add-seperate-non-temporal-tunable-for-memset.patch
Patch186: backport-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
Patch187: backport-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
Patch188: backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
Patch189: backport-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
Patch190: backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
Patch191: backport-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
Patch192: assert-Add-test-for-CVE-2025-0395.patch
Patch193: AArch64-Improve-generic-strlen.patch
Patch194: AArch64-Optimize-memset.patch
Patch195: AArch64-Remove-zva_128-from-memset.patch
Patch196: math-Improve-layout-of-expf-data.patch
Patch197: AArch64-Add-SVE-memset.patch
Patch198: AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch
Patch199: math-Improve-layout-of-exp-exp10-data.patch
#openEuler patch list
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
@ -310,6 +329,8 @@ Patch9034: 0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
Patch9035: 0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
Patch9036: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
Patch9037: fix-CVE-2019-1010023.patch
Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
BuildRequires: audit-libs-devel >= 1.1.3, sed >= 3.95, libcap-devel, gettext
@ -646,6 +667,7 @@ mkdir $builddir
pushd $builddir
../configure CC="%GCC" CXX="%GXX" CFLAGS="$BuildFlags" LDFLAGS="$LinkFlags" \
--prefix=%{_prefix} \
--enable-hardcoded-path-in-tests \
--with-headers=%{_prefix}/include $EnableKernel \
--with-nonshared-cflags=-Wp,-D_FORTIFY_SOURCE=2 \
--enable-bind-now \
@ -1085,7 +1107,9 @@ function removeLoadPath()
currPath=$(echo $runpathInfo | awk -F "RUNPATH=" '{print $2}')
fi
if [ x"$currPath" == x"\$ORIGIN" ]; then
# commit 2dcaf7064 uses rpath instead of runpath with --enable-hardcoded-path-in-tests
# so match "\$ORIGIN"* to cover rpath values
if [[ x"$currPath" == x"\$ORIGIN"* ]]; then
chrpath -d $file
findReliantLib $file
@ -1486,6 +1510,37 @@ fi
%endif
%changelog
* Fri Mar 28 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-57
- math: Improve layout of exp/exp10 data
- AArch64: Use prefer_sve_ifuncs for SVE memset
- AArch64: Add SVE memset
- math: Improve layout of expf data
- AArch64: Remove zva_128 from memset
- AArch64: Optimize memset
- AArch64: Improve generic strlen
- assert: Add test for CVE-2025-0395
* Wed Mar 12 2025 xiajimei <xiejiamei@hygon.cn> - 2.38-56
- x86: Enable non-temporal memset for Hygon processors
- x86: Add `Avoid_STOSB` tunable to allow NT memset without ERMS
- x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
- x86: Disable non-temporal memset on Skylake Server
- x86: Fix value for `x86_memset_non_temporal_threshold` when it is undesirable
- x86: Enable non-temporal memset tunable for AMD
- x86: Add seperate non-temporal tunable for memset
- x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
- x86: Add cache information support for Hygon processors
- x86: Add new architecture type for Hygon processors
* Sat Mar 08 2025 shixuantong <shixuantong1@huawei.com> - 2.38-55
- elf: Keep using minimal malloc after early DTV resize
* Tue Feb 18 2025 shixuantong <shixuantong1@huawei.com> - 2.38-54
- glibc testcase use newly built ld.so instead of environment default installed ld.so
* Thu Feb 06 2025 shixuantong <shixuantong1@huawei.com> - 2.38-53
- fix CVE-2019-1010023
* Sun Jan 26 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-52
- stdlib: Test using setenv with updated environ [BZ #32588]
- Fix underallocation of abort_msg_s struct (CVE-2025-0395)


@ -0,0 +1,39 @@
From 5a08d049dc5037e89eb95bb1506652f0043fa39e Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Fri, 13 Dec 2024 15:43:07 +0000
Subject: [PATCH] math: Improve layout of exp/exp10 data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp_data struct slightly so that the fields are better aligned
and without gaps. As a result on targets that support them, more load-pair
instructions are used in exp.
The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on
Neoverse V2.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7)
---
sysdeps/ieee754/dbl-64/math_config.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 19af33fd86..52b720ecd1 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -195,10 +195,11 @@ check_uflow (double x)
extern const struct exp_data
{
double invln2N;
- double shift;
double negln2hiN;
double negln2loN;
double poly[4]; /* Last four coefficients. */
+ double shift;
+
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
--
2.27.0
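
A self-contained illustration of the layout idea, with simplified stand-ins
rather than the real exp_data definition: moving the rarely-paired shift field
to the end leaves the constants that are used together in adjacent slots, the
kind of arrangement that lets targets with load-pair instructions (e.g.
AArch64 ldp) fetch two doubles at once.

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the old and new field orders.  */
struct old_layout { double invln2N, shift, negln2hiN, negln2loN, poly[4]; };
struct new_layout { double invln2N, negln2hiN, negln2loN, poly[4], shift; };

int
main (void)
{
  printf ("old: negln2hiN at %zu, poly at %zu\n",
          offsetof (struct old_layout, negln2hiN),   /* 16 */
          offsetof (struct old_layout, poly));       /* 32 */
  printf ("new: negln2hiN at %zu, poly at %zu\n",
          offsetof (struct new_layout, negln2hiN),   /* 8 */
          offsetof (struct new_layout, poly));       /* 24 */
  return 0;
}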


@ -0,0 +1,34 @@
From 3de5112326a4274c97f154f3d335c11965ee960c Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 24 Jul 2024 15:17:47 +0100
Subject: [PATCH] math: Improve layout of expf data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp2f_data struct slightly so that the fields are better aligned.
As a result on targets that support them, load-pair instructions accessing
poly_scaled and invln2_scaled are now 16-byte aligned.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
---
sysdeps/ieee754/flt-32/math_config.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
index d1b06a1a90..5904eb9bac 100644
--- a/sysdeps/ieee754/flt-32/math_config.h
+++ b/sysdeps/ieee754/flt-32/math_config.h
@@ -166,9 +166,9 @@ extern const struct exp2f_data
uint64_t tab[1 << EXP2F_TABLE_BITS];
double shift_scaled;
double poly[EXP2F_POLY_ORDER];
- double shift;
double invln2_scaled;
double poly_scaled[EXP2F_POLY_ORDER];
+ double shift;
} __exp2f_data attribute_hidden;
#define LOGF_TABLE_BITS 4
--
2.27.0