Compare commits
10 Commits
de3e1f1fa3
...
5538373d14
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5538373d14 | ||
|
|
b453407cdf | ||
|
|
d9f212c1d8 | ||
|
|
cb3e0b2e06 | ||
|
|
a2a517c64a | ||
|
|
e214ed3103 | ||
|
|
2b4695acd8 | ||
|
|
b606cd617b | ||
|
|
2f571cfc1d | ||
|
|
9b958700fd |
200
AArch64-Add-SVE-memset.patch
Normal file
200
AArch64-Add-SVE-memset.patch
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
From 52c2b1556f773d9a75d030160e0e273a5ea84502 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||||
|
Date: Tue, 24 Dec 2024 18:01:59 +0000
|
||||||
|
Subject: [PATCH] AArch64: Add SVE memset
|
||||||
|
|
||||||
|
Add SVE memset based on the generic memset with predicated store for sizes < 16.
|
||||||
|
Unaligned memsets of 128-1024 are improved by ~20% on average by using aligned
|
||||||
|
stores for the last 64 bytes. Performance of random memset benchmark improves
|
||||||
|
by ~2% on Neoverse V1.
|
||||||
|
|
||||||
|
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
|
||||||
|
(cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
|
||||||
|
---
|
||||||
|
sysdeps/aarch64/multiarch/Makefile | 1 +
|
||||||
|
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 +-
|
||||||
|
sysdeps/aarch64/multiarch/memset.c | 4 +
|
||||||
|
sysdeps/aarch64/multiarch/memset_sve_zva64.S | 123 +++++++++++++++++++
|
||||||
|
4 files changed, 130 insertions(+), 1 deletion(-)
|
||||||
|
create mode 100644 sysdeps/aarch64/multiarch/memset_sve_zva64.S
|
||||||
|
|
||||||
|
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
|
||||||
|
index e4720b7468..214b6137b0 100644
|
||||||
|
--- a/sysdeps/aarch64/multiarch/Makefile
|
||||||
|
+++ b/sysdeps/aarch64/multiarch/Makefile
|
||||||
|
@@ -14,6 +14,7 @@ sysdep_routines += \
|
||||||
|
memset_generic \
|
||||||
|
memset_kunpeng \
|
||||||
|
memset_mops \
|
||||||
|
+ memset_sve_zva64 \
|
||||||
|
memset_zva64 \
|
||||||
|
strlen_asimd \
|
||||||
|
strlen_generic \
|
||||||
|
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
|
||||||
|
index 73038ac810..2fa6baa319 100644
|
||||||
|
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
|
||||||
|
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
|
||||||
|
@@ -56,7 +56,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||||
|
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
|
||||||
|
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
|
||||||
|
#if HAVE_AARCH64_SVE_ASM
|
||||||
|
- IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
|
||||||
|
+ IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
|
||||||
|
+ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
|
||||||
|
#endif
|
||||||
|
IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
|
||||||
|
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
|
||||||
|
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
|
||||||
|
index 6deb6865e5..89fde57f42 100644
|
||||||
|
--- a/sysdeps/aarch64/multiarch/memset.c
|
||||||
|
+++ b/sysdeps/aarch64/multiarch/memset.c
|
||||||
|
@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
|
||||||
|
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
|
||||||
|
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
|
||||||
|
extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
|
||||||
|
+extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
|
||||||
|
|
||||||
|
static inline __typeof (__redirect_memset) *
|
||||||
|
select_memset_ifunc (void)
|
||||||
|
@@ -47,6 +48,9 @@ select_memset_ifunc (void)
|
||||||
|
{
|
||||||
|
if (IS_A64FX (midr) && zva_size == 256)
|
||||||
|
return __memset_a64fx;
|
||||||
|
+
|
||||||
|
+ if (zva_size == 64)
|
||||||
|
+ return __memset_sve_zva64;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_KUNPENG920 (midr))
|
||||||
|
diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..7fb40fdd9e
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
|
||||||
|
@@ -0,0 +1,123 @@
|
||||||
|
+/* Optimized memset for SVE.
|
||||||
|
+ Copyright (C) 2025 Free Software Foundation, Inc.
|
||||||
|
+
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library. If not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+/* Assumptions:
|
||||||
|
+ *
|
||||||
|
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
|
||||||
|
+ * ZVA size is 64.
|
||||||
|
+ */
|
||||||
|
+
|
||||||
|
+#if HAVE_AARCH64_SVE_ASM
|
||||||
|
+
|
||||||
|
+.arch armv8.2-a+sve
|
||||||
|
+
|
||||||
|
+#define dstin x0
|
||||||
|
+#define val x1
|
||||||
|
+#define valw w1
|
||||||
|
+#define count x2
|
||||||
|
+#define dst x3
|
||||||
|
+#define dstend x4
|
||||||
|
+#define zva_val x5
|
||||||
|
+#define vlen x5
|
||||||
|
+#define off x3
|
||||||
|
+#define dstend2 x5
|
||||||
|
+
|
||||||
|
+ENTRY (__memset_sve_zva64)
|
||||||
|
+ dup v0.16B, valw
|
||||||
|
+ cmp count, 16
|
||||||
|
+ b.lo L(set_16)
|
||||||
|
+
|
||||||
|
+ add dstend, dstin, count
|
||||||
|
+ cmp count, 64
|
||||||
|
+ b.hs L(set_128)
|
||||||
|
+
|
||||||
|
+ /* Set 16..63 bytes. */
|
||||||
|
+ mov off, 16
|
||||||
|
+ and off, off, count, lsr 1
|
||||||
|
+ sub dstend2, dstend, off
|
||||||
|
+ str q0, [dstin]
|
||||||
|
+ str q0, [dstin, off]
|
||||||
|
+ str q0, [dstend2, -16]
|
||||||
|
+ str q0, [dstend, -16]
|
||||||
|
+ ret
|
||||||
|
+
|
||||||
|
+ .p2align 4
|
||||||
|
+L(set_16):
|
||||||
|
+ whilelo p0.b, xzr, count
|
||||||
|
+ st1b z0.b, p0, [dstin]
|
||||||
|
+ ret
|
||||||
|
+
|
||||||
|
+ .p2align 4
|
||||||
|
+L(set_128):
|
||||||
|
+ bic dst, dstin, 15
|
||||||
|
+ cmp count, 128
|
||||||
|
+ b.hi L(set_long)
|
||||||
|
+ stp q0, q0, [dstin]
|
||||||
|
+ stp q0, q0, [dstin, 32]
|
||||||
|
+ stp q0, q0, [dstend, -64]
|
||||||
|
+ stp q0, q0, [dstend, -32]
|
||||||
|
+ ret
|
||||||
|
+
|
||||||
|
+ .p2align 4
|
||||||
|
+L(set_long):
|
||||||
|
+ cmp count, 256
|
||||||
|
+ b.lo L(no_zva)
|
||||||
|
+ tst valw, 255
|
||||||
|
+ b.ne L(no_zva)
|
||||||
|
+
|
||||||
|
+ str q0, [dstin]
|
||||||
|
+ str q0, [dst, 16]
|
||||||
|
+ bic dst, dstin, 31
|
||||||
|
+ stp q0, q0, [dst, 32]
|
||||||
|
+ bic dst, dstin, 63
|
||||||
|
+ sub count, dstend, dst /* Count is now 64 too large. */
|
||||||
|
+ sub count, count, 128 /* Adjust count and bias for loop. */
|
||||||
|
+
|
||||||
|
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
|
||||||
|
+ bic x8, x8, 15
|
||||||
|
+ stp q0, q0, [x8, -48]
|
||||||
|
+ str q0, [x8, -16]
|
||||||
|
+ str q0, [dstend, -16]
|
||||||
|
+
|
||||||
|
+ .p2align 4
|
||||||
|
+L(zva64_loop):
|
||||||
|
+ add dst, dst, 64
|
||||||
|
+ dc zva, dst
|
||||||
|
+ subs count, count, 64
|
||||||
|
+ b.hi L(zva64_loop)
|
||||||
|
+ ret
|
||||||
|
+
|
||||||
|
+L(no_zva):
|
||||||
|
+ str q0, [dstin]
|
||||||
|
+ sub count, dstend, dst /* Count is 16 too large. */
|
||||||
|
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
|
||||||
|
+L(no_zva_loop):
|
||||||
|
+ stp q0, q0, [dst, 16]
|
||||||
|
+ stp q0, q0, [dst, 48]
|
||||||
|
+ add dst, dst, 64
|
||||||
|
+ subs count, count, 64
|
||||||
|
+ b.hi L(no_zva_loop)
|
||||||
|
+ stp q0, q0, [dstend, -64]
|
||||||
|
+ stp q0, q0, [dstend, -32]
|
||||||
|
+ ret
|
||||||
|
+
|
||||||
|
+END (__memset_sve_zva64)
|
||||||
|
+#endif
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
92
AArch64-Improve-generic-strlen.patch
Normal file
92
AArch64-Improve-generic-strlen.patch
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
From 9ca74b8ad1968d935815bdc2f1f1c7e9f2e32f70 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||||
|
Date: Wed, 7 Aug 2024 14:43:47 +0100
|
||||||
|
Subject: [PATCH] AArch64: Improve generic strlen
|
||||||
|
|
||||||
|
Improve performance by handling another 16 bytes before entering the loop.
|
||||||
|
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
|
||||||
|
size computation to avoid increasing latency. On Neoverse V1 performance
|
||||||
|
of the random strlen benchmark improves by 4.6%.
|
||||||
|
|
||||||
|
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
|
||||||
|
---
|
||||||
|
sysdeps/aarch64/strlen.S | 39 +++++++++++++++++++++++++++------------
|
||||||
|
1 file changed, 27 insertions(+), 12 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
|
||||||
|
index 133ef93342..352fb40d3a 100644
|
||||||
|
--- a/sysdeps/aarch64/strlen.S
|
||||||
|
+++ b/sysdeps/aarch64/strlen.S
|
||||||
|
@@ -1,4 +1,5 @@
|
||||||
|
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
|
||||||
|
+/* Generic optimized strlen using SIMD.
|
||||||
|
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
|
||||||
|
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||||
|
fmov synd, dend
|
||||||
|
lsr synd, synd, shift
|
||||||
|
- cbz synd, L(loop)
|
||||||
|
+ cbz synd, L(next16)
|
||||||
|
|
||||||
|
rbit synd, synd
|
||||||
|
clz result, synd
|
||||||
|
lsr result, result, 2
|
||||||
|
ret
|
||||||
|
|
||||||
|
+L(next16):
|
||||||
|
+ ldr data, [src, 16]
|
||||||
|
+ cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
|
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||||
|
+ fmov synd, dend
|
||||||
|
+ cbz synd, L(loop)
|
||||||
|
+ add src, src, 16
|
||||||
|
+#ifndef __AARCH64EB__
|
||||||
|
+ rbit synd, synd
|
||||||
|
+#endif
|
||||||
|
+ sub result, src, srcin
|
||||||
|
+ clz tmp, synd
|
||||||
|
+ add result, result, tmp, lsr 2
|
||||||
|
+ ret
|
||||||
|
+
|
||||||
|
.p2align 5
|
||||||
|
L(loop):
|
||||||
|
- ldr data, [src, 16]
|
||||||
|
+ ldr data, [src, 32]!
|
||||||
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
|
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||||
|
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
|
||||||
|
fmov synd, dend
|
||||||
|
cbnz synd, L(loop_end)
|
||||||
|
- ldr data, [src, 32]!
|
||||||
|
+ ldr data, [src, 16]
|
||||||
|
cmeq vhas_nul.16b, vdata.16b, 0
|
||||||
|
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||||
|
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
|
||||||
|
fmov synd, dend
|
||||||
|
cbz synd, L(loop)
|
||||||
|
- sub src, src, 16
|
||||||
|
+ add src, src, 16
|
||||||
|
L(loop_end):
|
||||||
|
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||||
|
- sub result, src, srcin
|
||||||
|
- fmov synd, dend
|
||||||
|
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
|
||||||
|
#ifndef __AARCH64EB__
|
||||||
|
rbit synd, synd
|
||||||
|
+ sub result, result, 3
|
||||||
|
#endif
|
||||||
|
- add result, result, 16
|
||||||
|
clz tmp, synd
|
||||||
|
- add result, result, tmp, lsr 2
|
||||||
|
+ sub result, tmp, result
|
||||||
|
+ lsr result, result, 2
|
||||||
|
ret
|
||||||
|
|
||||||
|
END (STRLEN)
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
287
AArch64-Optimize-memset.patch
Normal file
287
AArch64-Optimize-memset.patch
Normal file
@ -0,0 +1,287 @@
|
|||||||
|
From 95aa21432ccbf77225abd485d98df36ba760ff80 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||||
|
Date: Mon, 9 Sep 2024 15:26:47 +0100
|
||||||
|
Subject: [PATCH] AArch64: Optimize memset
|
||||||
|
|
||||||
|
Improve small memsets by avoiding branches and use overlapping stores.
|
||||||
|
Use DC ZVA for memsets over 128 bytes. Remove unnecessary code for ZVA sizes
|
||||||
|
other than 64 and 128. Performance of random memset benchmark improves by 24%
|
||||||
|
on Neoverse N1.
|
||||||
|
|
||||||
|
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
(cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8)
|
||||||
|
---
|
||||||
|
sysdeps/aarch64/memset.S | 195 +++++++++++++++++----------------------
|
||||||
|
1 file changed, 84 insertions(+), 111 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
|
||||||
|
index bbfb7184c3..caafb019e2 100644
|
||||||
|
--- a/sysdeps/aarch64/memset.S
|
||||||
|
+++ b/sysdeps/aarch64/memset.S
|
||||||
|
@@ -1,4 +1,5 @@
|
||||||
|
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
|
||||||
|
+/* Generic optimized memset using SIMD.
|
||||||
|
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
@@ -17,7 +18,6 @@
|
||||||
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
-#include "memset-reg.h"
|
||||||
|
|
||||||
|
#ifndef MEMSET
|
||||||
|
# define MEMSET memset
|
||||||
|
@@ -25,130 +25,132 @@
|
||||||
|
|
||||||
|
/* Assumptions:
|
||||||
|
*
|
||||||
|
- * ARMv8-a, AArch64, unaligned accesses
|
||||||
|
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
-ENTRY (MEMSET)
|
||||||
|
+#define dstin x0
|
||||||
|
+#define val x1
|
||||||
|
+#define valw w1
|
||||||
|
+#define count x2
|
||||||
|
+#define dst x3
|
||||||
|
+#define dstend x4
|
||||||
|
+#define zva_val x5
|
||||||
|
+#define off x3
|
||||||
|
+#define dstend2 x5
|
||||||
|
|
||||||
|
+ENTRY (MEMSET)
|
||||||
|
PTR_ARG (0)
|
||||||
|
SIZE_ARG (2)
|
||||||
|
|
||||||
|
dup v0.16B, valw
|
||||||
|
+ cmp count, 16
|
||||||
|
+ b.lo L(set_small)
|
||||||
|
+
|
||||||
|
add dstend, dstin, count
|
||||||
|
+ cmp count, 64
|
||||||
|
+ b.hs L(set_128)
|
||||||
|
|
||||||
|
- cmp count, 96
|
||||||
|
- b.hi L(set_long)
|
||||||
|
- cmp count, 16
|
||||||
|
- b.hs L(set_medium)
|
||||||
|
- mov val, v0.D[0]
|
||||||
|
+ /* Set 16..63 bytes. */
|
||||||
|
+ mov off, 16
|
||||||
|
+ and off, off, count, lsr 1
|
||||||
|
+ sub dstend2, dstend, off
|
||||||
|
+ str q0, [dstin]
|
||||||
|
+ str q0, [dstin, off]
|
||||||
|
+ str q0, [dstend2, -16]
|
||||||
|
+ str q0, [dstend, -16]
|
||||||
|
+ ret
|
||||||
|
|
||||||
|
+ .p2align 4
|
||||||
|
/* Set 0..15 bytes. */
|
||||||
|
- tbz count, 3, 1f
|
||||||
|
- str val, [dstin]
|
||||||
|
- str val, [dstend, -8]
|
||||||
|
- ret
|
||||||
|
- nop
|
||||||
|
-1: tbz count, 2, 2f
|
||||||
|
- str valw, [dstin]
|
||||||
|
- str valw, [dstend, -4]
|
||||||
|
+L(set_small):
|
||||||
|
+ add dstend, dstin, count
|
||||||
|
+ cmp count, 4
|
||||||
|
+ b.lo 2f
|
||||||
|
+ lsr off, count, 3
|
||||||
|
+ sub dstend2, dstend, off, lsl 2
|
||||||
|
+ str s0, [dstin]
|
||||||
|
+ str s0, [dstin, off, lsl 2]
|
||||||
|
+ str s0, [dstend2, -4]
|
||||||
|
+ str s0, [dstend, -4]
|
||||||
|
ret
|
||||||
|
+
|
||||||
|
+ /* Set 0..3 bytes. */
|
||||||
|
2: cbz count, 3f
|
||||||
|
+ lsr off, count, 1
|
||||||
|
strb valw, [dstin]
|
||||||
|
- tbz count, 1, 3f
|
||||||
|
- strh valw, [dstend, -2]
|
||||||
|
+ strb valw, [dstin, off]
|
||||||
|
+ strb valw, [dstend, -1]
|
||||||
|
3: ret
|
||||||
|
|
||||||
|
- /* Set 17..96 bytes. */
|
||||||
|
-L(set_medium):
|
||||||
|
- str q0, [dstin]
|
||||||
|
- tbnz count, 6, L(set96)
|
||||||
|
- str q0, [dstend, -16]
|
||||||
|
- tbz count, 5, 1f
|
||||||
|
- str q0, [dstin, 16]
|
||||||
|
- str q0, [dstend, -32]
|
||||||
|
-1: ret
|
||||||
|
-
|
||||||
|
.p2align 4
|
||||||
|
- /* Set 64..96 bytes. Write 64 bytes from the start and
|
||||||
|
- 32 bytes from the end. */
|
||||||
|
-L(set96):
|
||||||
|
- str q0, [dstin, 16]
|
||||||
|
+L(set_128):
|
||||||
|
+ bic dst, dstin, 15
|
||||||
|
+ cmp count, 128
|
||||||
|
+ b.hi L(set_long)
|
||||||
|
+ stp q0, q0, [dstin]
|
||||||
|
stp q0, q0, [dstin, 32]
|
||||||
|
+ stp q0, q0, [dstend, -64]
|
||||||
|
stp q0, q0, [dstend, -32]
|
||||||
|
ret
|
||||||
|
|
||||||
|
- .p2align 3
|
||||||
|
- nop
|
||||||
|
+ .p2align 4
|
||||||
|
L(set_long):
|
||||||
|
- and valw, valw, 255
|
||||||
|
- bic dst, dstin, 15
|
||||||
|
str q0, [dstin]
|
||||||
|
- cmp count, 256
|
||||||
|
- ccmp valw, 0, 0, cs
|
||||||
|
- b.eq L(try_zva)
|
||||||
|
-L(no_zva):
|
||||||
|
- sub count, dstend, dst /* Count is 16 too large. */
|
||||||
|
- sub dst, dst, 16 /* Dst is biased by -32. */
|
||||||
|
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
|
||||||
|
-1: stp q0, q0, [dst, 32]
|
||||||
|
- stp q0, q0, [dst, 64]!
|
||||||
|
-L(tail64):
|
||||||
|
- subs count, count, 64
|
||||||
|
- b.hi 1b
|
||||||
|
-2: stp q0, q0, [dstend, -64]
|
||||||
|
+ str q0, [dst, 16]
|
||||||
|
+ tst valw, 255
|
||||||
|
+ b.ne L(no_zva)
|
||||||
|
+#ifndef ZVA64_ONLY
|
||||||
|
+ mrs zva_val, dczid_el0
|
||||||
|
+ and zva_val, zva_val, 31
|
||||||
|
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
|
||||||
|
+ b.ne L(zva_128)
|
||||||
|
+#endif
|
||||||
|
+ stp q0, q0, [dst, 32]
|
||||||
|
+ bic dst, dstin, 63
|
||||||
|
+ sub count, dstend, dst /* Count is now 64 too large. */
|
||||||
|
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
|
||||||
|
+
|
||||||
|
+ /* Write last bytes before ZVA loop. */
|
||||||
|
+ stp q0, q0, [dstend, -64]
|
||||||
|
stp q0, q0, [dstend, -32]
|
||||||
|
+
|
||||||
|
+ .p2align 4
|
||||||
|
+L(zva64_loop):
|
||||||
|
+ add dst, dst, 64
|
||||||
|
+ dc zva, dst
|
||||||
|
+ subs count, count, 64
|
||||||
|
+ b.hi L(zva64_loop)
|
||||||
|
ret
|
||||||
|
|
||||||
|
-L(try_zva):
|
||||||
|
-#ifndef ZVA64_ONLY
|
||||||
|
.p2align 3
|
||||||
|
- mrs tmp1, dczid_el0
|
||||||
|
- tbnz tmp1w, 4, L(no_zva)
|
||||||
|
- and tmp1w, tmp1w, 15
|
||||||
|
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
|
||||||
|
- b.ne L(zva_128)
|
||||||
|
- nop
|
||||||
|
-#endif
|
||||||
|
- /* Write the first and last 64 byte aligned block using stp rather
|
||||||
|
- than using DC ZVA. This is faster on some cores.
|
||||||
|
- */
|
||||||
|
- .p2align 4
|
||||||
|
-L(zva_64):
|
||||||
|
- str q0, [dst, 16]
|
||||||
|
+L(no_zva):
|
||||||
|
+ sub count, dstend, dst /* Count is 32 too large. */
|
||||||
|
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
|
||||||
|
+L(no_zva_loop):
|
||||||
|
stp q0, q0, [dst, 32]
|
||||||
|
- bic dst, dst, 63
|
||||||
|
stp q0, q0, [dst, 64]
|
||||||
|
- stp q0, q0, [dst, 96]
|
||||||
|
- sub count, dstend, dst /* Count is now 128 too large. */
|
||||||
|
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
|
||||||
|
- add dst, dst, 128
|
||||||
|
-1: dc zva, dst
|
||||||
|
add dst, dst, 64
|
||||||
|
subs count, count, 64
|
||||||
|
- b.hi 1b
|
||||||
|
- stp q0, q0, [dst, 0]
|
||||||
|
- stp q0, q0, [dst, 32]
|
||||||
|
+ b.hi L(no_zva_loop)
|
||||||
|
stp q0, q0, [dstend, -64]
|
||||||
|
stp q0, q0, [dstend, -32]
|
||||||
|
ret
|
||||||
|
|
||||||
|
#ifndef ZVA64_ONLY
|
||||||
|
- .p2align 3
|
||||||
|
+ .p2align 4
|
||||||
|
L(zva_128):
|
||||||
|
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
|
||||||
|
- b.ne L(zva_other)
|
||||||
|
+ cmp zva_val, 5 /* ZVA size is 128 bytes. */
|
||||||
|
+ b.ne L(no_zva)
|
||||||
|
|
||||||
|
- str q0, [dst, 16]
|
||||||
|
stp q0, q0, [dst, 32]
|
||||||
|
stp q0, q0, [dst, 64]
|
||||||
|
stp q0, q0, [dst, 96]
|
||||||
|
bic dst, dst, 127
|
||||||
|
sub count, dstend, dst /* Count is now 128 too large. */
|
||||||
|
- sub count, count, 128+128 /* Adjust count and bias for loop. */
|
||||||
|
- add dst, dst, 128
|
||||||
|
-1: dc zva, dst
|
||||||
|
- add dst, dst, 128
|
||||||
|
+ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
|
||||||
|
+1: add dst, dst, 128
|
||||||
|
+ dc zva, dst
|
||||||
|
subs count, count, 128
|
||||||
|
b.hi 1b
|
||||||
|
stp q0, q0, [dstend, -128]
|
||||||
|
@@ -156,35 +158,6 @@ L(zva_128):
|
||||||
|
stp q0, q0, [dstend, -64]
|
||||||
|
stp q0, q0, [dstend, -32]
|
||||||
|
ret
|
||||||
|
-
|
||||||
|
-L(zva_other):
|
||||||
|
- mov tmp2w, 4
|
||||||
|
- lsl zva_lenw, tmp2w, tmp1w
|
||||||
|
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
|
||||||
|
- cmp count, tmp1
|
||||||
|
- blo L(no_zva)
|
||||||
|
-
|
||||||
|
- sub tmp2, zva_len, 1
|
||||||
|
- add tmp1, dst, zva_len
|
||||||
|
- add dst, dst, 16
|
||||||
|
- subs count, tmp1, dst /* Actual alignment bytes to write. */
|
||||||
|
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
|
||||||
|
- beq 2f
|
||||||
|
-1: stp q0, q0, [dst], 64
|
||||||
|
- stp q0, q0, [dst, -32]
|
||||||
|
- subs count, count, 64
|
||||||
|
- b.hi 1b
|
||||||
|
-2: mov dst, tmp1
|
||||||
|
- sub count, dstend, tmp1 /* Remaining bytes to write. */
|
||||||
|
- subs count, count, zva_len
|
||||||
|
- b.lo 4f
|
||||||
|
-3: dc zva, dst
|
||||||
|
- add dst, dst, zva_len
|
||||||
|
- subs count, count, zva_len
|
||||||
|
- b.hs 3b
|
||||||
|
-4: add count, count, zva_len
|
||||||
|
- sub dst, dst, 32 /* Bias dst for tail loop. */
|
||||||
|
- b L(tail64)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
END (MEMSET)
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
65
AArch64-Remove-zva_128-from-memset.patch
Normal file
65
AArch64-Remove-zva_128-from-memset.patch
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
From 5fe151d86a19bc3dc791fd2d92efeb6c6e11cf64 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||||
|
Date: Mon, 25 Nov 2024 18:43:08 +0000
|
||||||
|
Subject: [PATCH] AArch64: Remove zva_128 from memset
|
||||||
|
|
||||||
|
Remove ZVA 128 support from memset - the new memset no longer
|
||||||
|
guarantees count >= 256, which can result in underflow and a
|
||||||
|
crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA
|
||||||
|
size of 128 and its memcpy implementation was removed in commit
|
||||||
|
e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special
|
||||||
|
case too.
|
||||||
|
|
||||||
|
[1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html
|
||||||
|
|
||||||
|
Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com>
|
||||||
|
(cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d)
|
||||||
|
---
|
||||||
|
sysdeps/aarch64/memset.S | 25 +------------------------
|
||||||
|
1 file changed, 1 insertion(+), 24 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
|
||||||
|
index caafb019e2..71814d0b2f 100644
|
||||||
|
--- a/sysdeps/aarch64/memset.S
|
||||||
|
+++ b/sysdeps/aarch64/memset.S
|
||||||
|
@@ -104,7 +104,7 @@ L(set_long):
|
||||||
|
mrs zva_val, dczid_el0
|
||||||
|
and zva_val, zva_val, 31
|
||||||
|
cmp zva_val, 4 /* ZVA size is 64 bytes. */
|
||||||
|
- b.ne L(zva_128)
|
||||||
|
+ b.ne L(no_zva)
|
||||||
|
#endif
|
||||||
|
stp q0, q0, [dst, 32]
|
||||||
|
bic dst, dstin, 63
|
||||||
|
@@ -137,28 +137,5 @@ L(no_zva_loop):
|
||||||
|
stp q0, q0, [dstend, -32]
|
||||||
|
ret
|
||||||
|
|
||||||
|
-#ifndef ZVA64_ONLY
|
||||||
|
- .p2align 4
|
||||||
|
-L(zva_128):
|
||||||
|
- cmp zva_val, 5 /* ZVA size is 128 bytes. */
|
||||||
|
- b.ne L(no_zva)
|
||||||
|
-
|
||||||
|
- stp q0, q0, [dst, 32]
|
||||||
|
- stp q0, q0, [dst, 64]
|
||||||
|
- stp q0, q0, [dst, 96]
|
||||||
|
- bic dst, dst, 127
|
||||||
|
- sub count, dstend, dst /* Count is now 128 too large. */
|
||||||
|
- sub count, count, 128 + 128 /* Adjust count and bias for loop. */
|
||||||
|
-1: add dst, dst, 128
|
||||||
|
- dc zva, dst
|
||||||
|
- subs count, count, 128
|
||||||
|
- b.hi 1b
|
||||||
|
- stp q0, q0, [dstend, -128]
|
||||||
|
- stp q0, q0, [dstend, -96]
|
||||||
|
- stp q0, q0, [dstend, -64]
|
||||||
|
- stp q0, q0, [dstend, -32]
|
||||||
|
- ret
|
||||||
|
-#endif
|
||||||
|
-
|
||||||
|
END (MEMSET)
|
||||||
|
libc_hidden_builtin_def (MEMSET)
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
29
AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch
Normal file
29
AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
From 097299ffa904b327fce83770fa6a522e4393ddb3 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||||
|
Date: Thu, 27 Feb 2025 16:28:52 +0000
|
||||||
|
Subject: [PATCH] AArch64: Use prefer_sve_ifuncs for SVE memset
|
||||||
|
|
||||||
|
Use prefer_sve_ifuncs for SVE memset just like memcpy.
|
||||||
|
|
||||||
|
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
|
||||||
|
(cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4)
|
||||||
|
---
|
||||||
|
sysdeps/aarch64/multiarch/memset.c | 2 +-
|
||||||
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
|
||||||
|
index 89fde57f42..ce5d35a20e 100644
|
||||||
|
--- a/sysdeps/aarch64/multiarch/memset.c
|
||||||
|
+++ b/sysdeps/aarch64/multiarch/memset.c
|
||||||
|
@@ -49,7 +49,7 @@ select_memset_ifunc (void)
|
||||||
|
if (IS_A64FX (midr) && zva_size == 256)
|
||||||
|
return __memset_a64fx;
|
||||||
|
|
||||||
|
- if (zva_size == 64)
|
||||||
|
+ if (prefer_sve_ifuncs && zva_size == 64)
|
||||||
|
return __memset_sve_zva64;
|
||||||
|
}
|
||||||
|
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
132
assert-Add-test-for-CVE-2025-0395.patch
Normal file
132
assert-Add-test-for-CVE-2025-0395.patch
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
From f984e2d7e8299726891a1a497a3c36cd5542a0bf Mon Sep 17 00:00:00 2001
|
||||||
|
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||||
|
Date: Fri, 31 Jan 2025 12:16:30 -0500
|
||||||
|
Subject: [PATCH] assert: Add test for CVE-2025-0395
|
||||||
|
|
||||||
|
Use the __progname symbol to override the program name to induce the
|
||||||
|
failure that CVE-2025-0395 describes.
|
||||||
|
|
||||||
|
This is related to BZ #32582
|
||||||
|
|
||||||
|
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||||
|
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
(cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
|
||||||
|
---
|
||||||
|
assert/Makefile | 1 +
|
||||||
|
assert/tst-assert-sa-2025-0001.c | 92 ++++++++++++++++++++++++++++++++
|
||||||
|
2 files changed, 93 insertions(+)
|
||||||
|
create mode 100644 assert/tst-assert-sa-2025-0001.c
|
||||||
|
|
||||||
|
diff --git a/assert/Makefile b/assert/Makefile
|
||||||
|
index 67f4e6a570..b0fc9fc4d2 100644
|
||||||
|
--- a/assert/Makefile
|
||||||
|
+++ b/assert/Makefile
|
||||||
|
@@ -38,6 +38,7 @@ tests := \
|
||||||
|
test-assert-perr \
|
||||||
|
tst-assert-c++ \
|
||||||
|
tst-assert-g++ \
|
||||||
|
+ tst-assert-sa-2025-0001 \
|
||||||
|
# tests
|
||||||
|
|
||||||
|
ifeq ($(have-cxx-thread_local),yes)
|
||||||
|
diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..102cb0078d
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/assert/tst-assert-sa-2025-0001.c
|
||||||
|
@@ -0,0 +1,92 @@
|
||||||
|
+/* Test for CVE-2025-0395.
|
||||||
|
+ Copyright The GNU Toolchain Authors.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+/* Test that a large enough __progname does not result in a buffer overflow
|
||||||
|
+ when printing an assertion failure. This was CVE-2025-0395. */
|
||||||
|
+#include <assert.h>
|
||||||
|
+#include <inttypes.h>
|
||||||
|
+#include <signal.h>
|
||||||
|
+#include <stdbool.h>
|
||||||
|
+#include <string.h>
|
||||||
|
+#include <sys/mman.h>
|
||||||
|
+#include <support/check.h>
|
||||||
|
+#include <support/support.h>
|
||||||
|
+#include <support/xstdio.h>
|
||||||
|
+#include <support/xunistd.h>
|
||||||
|
+
|
||||||
|
+extern const char *__progname;
|
||||||
|
+
|
||||||
|
+int
|
||||||
|
+do_test (int argc, char **argv)
|
||||||
|
+{
|
||||||
|
+
|
||||||
|
+ support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
|
||||||
|
+ ignore_stderr ();
|
||||||
|
+
|
||||||
|
+ /* XXX assumes that the assert is on a 2 digit line number. */
|
||||||
|
+ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
|
||||||
|
+
|
||||||
|
+ int ret = fprintf (stderr, prompt, __FILE__);
|
||||||
|
+ if (ret < 0)
|
||||||
|
+ FAIL_EXIT1 ("fprintf failed: %m\n");
|
||||||
|
+
|
||||||
|
+ size_t pagesize = getpagesize ();
|
||||||
|
+ size_t namesize = pagesize - 1 - ret;
|
||||||
|
+
|
||||||
|
+ /* Alter the progname so that the assert message fills the entire page. */
|
||||||
|
+ char progname[namesize];
|
||||||
|
+ memset (progname, 'A', namesize - 1);
|
||||||
|
+ progname[namesize - 1] = '\0';
|
||||||
|
+ __progname = progname;
|
||||||
|
+
|
||||||
|
+ FILE *f = xfopen ("/proc/self/maps", "r");
|
||||||
|
+ char *line = NULL;
|
||||||
|
+ size_t len = 0;
|
||||||
|
+ uintptr_t prev_to = 0;
|
||||||
|
+
|
||||||
|
+ /* Pad the beginning of every writable mapping with a PROT_NONE map. This
|
||||||
|
+ ensures that the mmap in the assert_fail path never ends up below a
|
||||||
|
+ writable map and will terminate immediately in case of a buffer
|
||||||
|
+ overflow. */
|
||||||
|
+ while (xgetline (&line, &len, f))
|
||||||
|
+ {
|
||||||
|
+ uintptr_t from, to;
|
||||||
|
+ char perm[4];
|
||||||
|
+
|
||||||
|
+ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ",
|
||||||
|
+ &from, &to,
|
||||||
|
+ &perm[0], &perm[1], &perm[2], &perm[3]);
|
||||||
|
+
|
||||||
|
+ bool writable = (memchr (perm, 'w', 4) != NULL);
|
||||||
|
+
|
||||||
|
+ if (prev_to != 0 && from - prev_to > pagesize && writable)
|
||||||
|
+ xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
|
||||||
|
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
|
||||||
|
+
|
||||||
|
+ prev_to = to;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ xfclose (f);
|
||||||
|
+
|
||||||
|
+ assert (argc < 1);
|
||||||
|
+ return 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#define EXPECTED_SIGNAL SIGABRT
|
||||||
|
+#define TEST_FUNCTION_ARGV do_test
|
||||||
|
+#include <support/test-driver.c>
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
210
backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
Normal file
210
backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
Normal file
@ -0,0 +1,210 @@
|
|||||||
|
From 17f7ca193d60fefd6cc5e48aacd1ce9f7dd29862 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Wed, 14 Aug 2024 14:37:31 +0800
|
||||||
|
Subject: [PATCH 09/10] x86: Add `Avoid_STOSB` tunable to allow NT memset
|
||||||
|
without ERMS
|
||||||
|
|
||||||
|
The goal of this flag is to allow targets which don't prefer/have ERMS
|
||||||
|
to still access the non-temporal memset implementation.
|
||||||
|
|
||||||
|
There are 4 cases for tuning memset:
|
||||||
|
1) `Avoid_STOSB && Avoid_Non_Temporal_Memset`
|
||||||
|
- Memset with temporal stores
|
||||||
|
2) `Avoid_STOSB && !Avoid_Non_Temporal_Memset`
|
||||||
|
- Memset with temporal/non-temporal stores. Non-temporal path
|
||||||
|
goes through `rep stosb` path. We accomplish this by setting
|
||||||
|
`x86_rep_stosb_threshold` to
|
||||||
|
`x86_memset_non_temporal_threshold`.
|
||||||
|
3) `!Avoid_STOSB && Avoid_Non_Temporal_Memset`
|
||||||
|
- Memset with temporal stores/`rep stosb`
|
||||||
|
4) `!Avoid_STOSB && !Avoid_Non_Temporal_Memset`
|
||||||
|
- Memset with temporal stores/`rep stosb`/non-temporal stores.
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/cpu-features.c | 4 +++
|
||||||
|
sysdeps/x86/cpu-tunables.c | 2 ++
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 34 ++++++++++++++++---
|
||||||
|
...cpu-features-preferred_feature_index_1.def | 1 +
|
||||||
|
sysdeps/x86/tst-hwcap-tunables.c | 6 ++--
|
||||||
|
sysdeps/x86_64/multiarch/ifunc-memset.h | 18 +++++++---
|
||||||
|
6 files changed, 53 insertions(+), 12 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||||
|
index c9f2297524..287edc5b08 100644
|
||||||
|
--- a/sysdeps/x86/cpu-features.c
|
||||||
|
+++ b/sysdeps/x86/cpu-features.c
|
||||||
|
@@ -1014,6 +1014,10 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
|
||||||
|
if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
|
||||||
|
cpu_features->preferred[index_arch_I686] |= bit_arch_I686;
|
||||||
|
|
||||||
|
+ /* No ERMS, we want to avoid stosb for memset. */
|
||||||
|
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||||
|
+ cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB;
|
||||||
|
+
|
||||||
|
#if !HAS_CPUID
|
||||||
|
no_cpuid:
|
||||||
|
#endif
|
||||||
|
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
||||||
|
index b8475730ea..a4bbf13080 100644
|
||||||
|
--- a/sysdeps/x86/cpu-tunables.c
|
||||||
|
+++ b/sysdeps/x86/cpu-tunables.c
|
||||||
|
@@ -214,6 +214,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||||
|
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
|
||||||
|
Prefer_FSRM,
|
||||||
|
disable, 11);
|
||||||
|
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, Avoid_STOSB,
|
||||||
|
+ disable, 11);
|
||||||
|
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH (n, cpu_features,
|
||||||
|
Slow_SSE4_2,
|
||||||
|
SSE4_2,
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index d8288f0b0c..5803bfcea8 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -1096,18 +1096,42 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
|
||||||
|
long int, NULL);
|
||||||
|
|
||||||
|
+ /*
|
||||||
|
+ For memset, the non-temporal implementation is only accessed through the
|
||||||
|
+ stosb code. ie:
|
||||||
|
+ ```
|
||||||
|
+ if (size >= rep_stosb_thresh)
|
||||||
|
+ {
|
||||||
|
+ if (size >= non_temporal_thresh)
|
||||||
|
+ {
|
||||||
|
+ do_non_temporal ();
|
||||||
|
+ }
|
||||||
|
+ do_stosb ();
|
||||||
|
+ }
|
||||||
|
+ do_normal_vec_loop ();
|
||||||
|
+ ```
|
||||||
|
+ So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh`
|
||||||
|
+ to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`,
|
||||||
|
+ `rep stosb` will never be used.
|
||||||
|
+ */
|
||||||
|
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
|
||||||
|
+ memset_non_temporal_threshold,
|
||||||
|
+ minimum_non_temporal_threshold, SIZE_MAX);
|
||||||
|
+ /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the
|
||||||
|
+ final value of `x86_memset_non_temporal_threshold`. In some cases this can
|
||||||
|
+ be a matter of correctness. */
|
||||||
|
+ if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB))
|
||||||
|
+ rep_stosb_threshold
|
||||||
|
+ = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
|
||||||
|
+ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
|
||||||
|
+ SIZE_MAX);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
|
||||||
|
minimum_non_temporal_threshold,
|
||||||
|
maximum_non_temporal_threshold);
|
||||||
|
- TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
|
||||||
|
- memset_non_temporal_threshold,
|
||||||
|
- minimum_non_temporal_threshold, SIZE_MAX);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
|
||||||
|
minimum_rep_movsb_threshold, SIZE_MAX);
|
||||||
|
- TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
|
||||||
|
- SIZE_MAX);
|
||||||
|
|
||||||
|
unsigned long int rep_movsb_stop_threshold;
|
||||||
|
/* Setting the upper bound of ERMS to the computed value of
|
||||||
|
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||||
|
index aae1c85551..38a0c9226c 100644
|
||||||
|
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||||
|
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||||
|
@@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512)
|
||||||
|
BIT (Prefer_FSRM)
|
||||||
|
BIT (Avoid_Short_Distance_REP_MOVSB)
|
||||||
|
BIT (Avoid_Non_Temporal_Memset)
|
||||||
|
+BIT (Avoid_STOSB)
|
||||||
|
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
|
||||||
|
index 94307283d7..1920f5057e 100644
|
||||||
|
--- a/sysdeps/x86/tst-hwcap-tunables.c
|
||||||
|
+++ b/sysdeps/x86/tst-hwcap-tunables.c
|
||||||
|
@@ -60,7 +60,8 @@ static const struct test_t
|
||||||
|
/* Disable everything. */
|
||||||
|
"-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
|
||||||
|
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
|
||||||
|
- "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
|
||||||
|
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
|
||||||
|
+ "-Avoid_STOSB",
|
||||||
|
test_1,
|
||||||
|
array_length (test_1)
|
||||||
|
},
|
||||||
|
@@ -68,7 +69,8 @@ static const struct test_t
|
||||||
|
/* Same as before, but with some empty suboptions. */
|
||||||
|
",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
|
||||||
|
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
|
||||||
|
- "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
|
||||||
|
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
|
||||||
|
+ "-Avoid_STOSB,-,",
|
||||||
|
test_1,
|
||||||
|
array_length (test_1)
|
||||||
|
}
|
||||||
|
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||||
|
index 5c5096ec5a..6b3b9a17a2 100644
|
||||||
|
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||||
|
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||||
|
@@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
|
||||||
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
|
||||||
|
attribute_hidden;
|
||||||
|
|
||||||
|
+static inline int
|
||||||
|
+prefer_erms_nt_impl (const struct cpu_features *cpu_features)
|
||||||
|
+{
|
||||||
|
+ return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
|
||||||
|
+ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
static inline void *
|
||||||
|
IFUNC_SELECTOR (void)
|
||||||
|
{
|
||||||
|
@@ -61,7 +68,7 @@ IFUNC_SELECTOR (void)
|
||||||
|
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||||
|
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||||
|
{
|
||||||
|
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||||
|
+ if (prefer_erms_nt_impl (cpu_features))
|
||||||
|
return OPTIMIZE (avx512_unaligned_erms);
|
||||||
|
|
||||||
|
return OPTIMIZE (avx512_unaligned);
|
||||||
|
@@ -76,7 +83,7 @@ IFUNC_SELECTOR (void)
|
||||||
|
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||||
|
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||||
|
{
|
||||||
|
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||||
|
+ if (prefer_erms_nt_impl (cpu_features))
|
||||||
|
return OPTIMIZE (evex_unaligned_erms);
|
||||||
|
|
||||||
|
return OPTIMIZE (evex_unaligned);
|
||||||
|
@@ -84,7 +91,7 @@ IFUNC_SELECTOR (void)
|
||||||
|
|
||||||
|
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||||
|
{
|
||||||
|
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||||
|
+ if (prefer_erms_nt_impl (cpu_features))
|
||||||
|
return OPTIMIZE (avx2_unaligned_erms_rtm);
|
||||||
|
|
||||||
|
return OPTIMIZE (avx2_unaligned_rtm);
|
||||||
|
@@ -93,14 +100,15 @@ IFUNC_SELECTOR (void)
|
||||||
|
if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
|
||||||
|
Prefer_No_VZEROUPPER, !))
|
||||||
|
{
|
||||||
|
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||||
|
+ if (prefer_erms_nt_impl (cpu_features))
|
||||||
|
return OPTIMIZE (avx2_unaligned_erms);
|
||||||
|
|
||||||
|
return OPTIMIZE (avx2_unaligned);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||||
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)
|
||||||
|
+ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
|
||||||
|
return OPTIMIZE (sse2_unaligned_erms);
|
||||||
|
|
||||||
|
return OPTIMIZE (sse2_unaligned);
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
@ -0,0 +1,95 @@
|
|||||||
|
From 01b5cac929a3be361dd575bed6673c40a25a6d61 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Wed, 14 Aug 2024 14:37:30 +0800
|
||||||
|
Subject: [PATCH 08/10] x86: Use `Avoid_Non_Temporal_Memset` to control
|
||||||
|
non-temporal path
|
||||||
|
|
||||||
|
This is just a refactor and there should be no behavioral change from
|
||||||
|
this commit.
|
||||||
|
|
||||||
|
The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
|
||||||
|
for controlling whether we use non-temporal memset rather than having
|
||||||
|
extra logic based on vendor.
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
|
||||||
|
2 files changed, 23 insertions(+), 8 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||||
|
index b4030776a7..c9f2297524 100644
|
||||||
|
--- a/sysdeps/x86/cpu-features.c
|
||||||
|
+++ b/sysdeps/x86/cpu-features.c
|
||||||
|
@@ -640,6 +640,12 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||||
|
unsigned int stepping = 0;
|
||||||
|
enum cpu_features_kind kind;
|
||||||
|
|
||||||
|
+ /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
|
||||||
|
+ as of writing this, we only have benchmarks indicating its profitability
|
||||||
|
+ on Intel/AMD. */
|
||||||
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||||
|
+ |= bit_arch_Avoid_Non_Temporal_Memset;
|
||||||
|
+
|
||||||
|
cpu_features->cachesize_non_temporal_divisor = 4;
|
||||||
|
#if !HAS_CPUID
|
||||||
|
if (__get_cpuid_max (0, 0) == 0)
|
||||||
|
@@ -665,6 +671,11 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||||
|
|
||||||
|
update_active (cpu_features);
|
||||||
|
|
||||||
|
+ /* Benchmarks indicate non-temporal memset can be profitable on Intel
|
||||||
|
+ hardware. */
|
||||||
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||||
|
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
|
||||||
|
+
|
||||||
|
if (family == 0x06)
|
||||||
|
{
|
||||||
|
model += extended_model;
|
||||||
|
@@ -874,6 +885,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
|
||||||
|
|
||||||
|
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
|
||||||
|
|
||||||
|
+ /* Benchmarks indicate non-temporal memset can be profitable on AMD
|
||||||
|
+ hardware. */
|
||||||
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||||
|
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
|
||||||
|
+
|
||||||
|
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
|
||||||
|
{
|
||||||
|
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index de4584116f..d8288f0b0c 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -1048,14 +1048,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||||
|
rep_movsb_threshold = 2112;
|
||||||
|
|
||||||
|
- /* Non-temporal stores are more performant on Intel and AMD hardware above
|
||||||
|
- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
||||||
|
- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||||
|
- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
|
||||||
|
- && (cpu_features->basic.kind == arch_kind_intel
|
||||||
|
- || cpu_features->basic.kind == arch_kind_amd))
|
||||||
|
- memset_non_temporal_threshold = non_temporal_threshold;
|
||||||
|
-
|
||||||
|
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||||
|
cases slower than the vectorized path (and for some alignments,
|
||||||
|
it is really slow, check BZ #30994). */
|
||||||
|
@@ -1077,6 +1069,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (tunable_size != 0)
|
||||||
|
shared = tunable_size;
|
||||||
|
|
||||||
|
+ /* Non-temporal stores are more performant on some hardware above
|
||||||
|
+ non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
|
||||||
|
+ Intel and AMD hardware. */
|
||||||
|
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||||
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
|
||||||
|
+ memset_non_temporal_threshold = non_temporal_threshold;
|
||||||
|
+
|
||||||
|
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
|
||||||
|
if (tunable_size > minimum_non_temporal_threshold
|
||||||
|
&& tunable_size <= maximum_non_temporal_threshold)
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
@ -0,0 +1,192 @@
|
|||||||
|
From aa3d7bd5299b33bffc118aa618b59bfa66059bcb Mon Sep 17 00:00:00 2001
|
||||||
|
From: Florian Weimer <fweimer@redhat.com>
|
||||||
|
Date: Thu, 13 Feb 2025 21:56:52 +0100
|
||||||
|
Subject: [PATCH] elf: Keep using minimal malloc after early DTV resize (bug
|
||||||
|
32412)
|
||||||
|
|
||||||
|
If an auditor loads many TLS-using modules during startup, it is
|
||||||
|
possible to trigger DTV resizing. Previously, the DTV was marked
|
||||||
|
as allocated by the main malloc afterwards, even if the minimal
|
||||||
|
malloc was still in use. With this change, _dl_resize_dtv marks
|
||||||
|
the resized DTV as allocated with the minimal malloc.
|
||||||
|
|
||||||
|
The new test reuses TLS-using modules from other auditing tests.
|
||||||
|
|
||||||
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||||
|
---
|
||||||
|
elf/Makefile | 5 +++
|
||||||
|
elf/dl-tls.c | 7 ++++
|
||||||
|
elf/tst-audit-tlsdesc-dlopen2.c | 46 +++++++++++++++++++++++++
|
||||||
|
elf/tst-auditmod-tlsdesc2.c | 59 +++++++++++++++++++++++++++++++++
|
||||||
|
4 files changed, 117 insertions(+)
|
||||||
|
create mode 100644 elf/tst-audit-tlsdesc-dlopen2.c
|
||||||
|
create mode 100644 elf/tst-auditmod-tlsdesc2.c
|
||||||
|
|
||||||
|
diff --git a/elf/Makefile b/elf/Makefile
|
||||||
|
index 5c833871d0..1ea0e7037e 100644
|
||||||
|
--- a/elf/Makefile
|
||||||
|
+++ b/elf/Makefile
|
||||||
|
@@ -379,6 +379,7 @@ tests += \
|
||||||
|
tst-align3 \
|
||||||
|
tst-audit-tlsdesc \
|
||||||
|
tst-audit-tlsdesc-dlopen \
|
||||||
|
+ tst-audit-tlsdesc-dlopen2 \
|
||||||
|
tst-audit1 \
|
||||||
|
tst-audit2 \
|
||||||
|
tst-audit8 \
|
||||||
|
@@ -863,6 +864,7 @@ modules-names += \
|
||||||
|
tst-auditmanymod8 \
|
||||||
|
tst-auditmanymod9 \
|
||||||
|
tst-auditmod-tlsdesc \
|
||||||
|
+ tst-auditmod-tlsdesc2 \
|
||||||
|
tst-auditmod1 \
|
||||||
|
tst-auditmod11 \
|
||||||
|
tst-auditmod12 \
|
||||||
|
@@ -3189,6 +3191,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
|
||||||
|
tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
|
||||||
|
$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
|
||||||
|
tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
|
||||||
|
+$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \
|
||||||
|
+ $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules))
|
||||||
|
+tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so
|
||||||
|
|
||||||
|
$(objpfx)tst-dlmopen-twice.out: \
|
||||||
|
$(objpfx)tst-dlmopen-twice-mod1.so \
|
||||||
|
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
|
||||||
|
index 5178d9b66a..a083a82933 100644
|
||||||
|
--- a/elf/dl-tls.c
|
||||||
|
+++ b/elf/dl-tls.c
|
||||||
|
@@ -566,6 +566,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
|
||||||
|
if (newp == NULL)
|
||||||
|
oom ();
|
||||||
|
memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t));
|
||||||
|
+#ifdef SHARED
|
||||||
|
+ /* Auditors can trigger a DTV resize event while the full malloc
|
||||||
|
+ is not yet in use. Mark the new DTV allocation as the
|
||||||
|
+ initial allocation. */
|
||||||
|
+ if (!__rtld_malloc_is_complete ())
|
||||||
|
+ GL(dl_initial_dtv) = &newp[1];
|
||||||
|
+#endif
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..7ba2c4129a
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/elf/tst-audit-tlsdesc-dlopen2.c
|
||||||
|
@@ -0,0 +1,46 @@
|
||||||
|
+/* Loading TLS-using modules from auditors (bug 32412). Main program.
|
||||||
|
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <support/xdlfcn.h>
|
||||||
|
+#include <stdio.h>
|
||||||
|
+
|
||||||
|
+static int
|
||||||
|
+do_test (void)
|
||||||
|
+{
|
||||||
|
+ puts ("info: start of main program");
|
||||||
|
+
|
||||||
|
+ /* Load TLS-using modules, to trigger DTV resizing. The dynamic
|
||||||
|
+ linker will load them again (requiring their own TLS) because the
|
||||||
|
+ dlopen calls from the auditor were in the auditing namespace. */
|
||||||
|
+ for (int i = 1; i <= 19; ++i)
|
||||||
|
+ {
|
||||||
|
+ char dso[30];
|
||||||
|
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
|
||||||
|
+ char sym[30];
|
||||||
|
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
|
||||||
|
+
|
||||||
|
+ void *handle = xdlopen (dso, RTLD_LAZY);
|
||||||
|
+ int (*func) (void) = xdlsym (handle, sym);
|
||||||
|
+ /* Trigger TLS allocation. */
|
||||||
|
+ func ();
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#include <support/test-driver.c>
|
||||||
|
diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..50275cd34d
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/elf/tst-auditmod-tlsdesc2.c
|
||||||
|
@@ -0,0 +1,59 @@
|
||||||
|
+/* Loading TLS-using modules from auditors (bug 32412). Audit module.
|
||||||
|
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <dlfcn.h>
|
||||||
|
+#include <link.h>
|
||||||
|
+#include <stdbool.h>
|
||||||
|
+#include <stdio.h>
|
||||||
|
+#include <unistd.h>
|
||||||
|
+
|
||||||
|
+unsigned int
|
||||||
|
+la_version (unsigned int version)
|
||||||
|
+{
|
||||||
|
+ /* Open some modules, to trigger DTV resizing before the switch to
|
||||||
|
+ the main malloc. */
|
||||||
|
+ for (int i = 1; i <= 19; ++i)
|
||||||
|
+ {
|
||||||
|
+ char dso[30];
|
||||||
|
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
|
||||||
|
+ char sym[30];
|
||||||
|
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
|
||||||
|
+
|
||||||
|
+ void *handle = dlopen (dso, RTLD_LAZY);
|
||||||
|
+ if (handle == NULL)
|
||||||
|
+ {
|
||||||
|
+ printf ("error: dlmopen from auditor: %s\n", dlerror ());
|
||||||
|
+ fflush (stdout);
|
||||||
|
+ _exit (1);
|
||||||
|
+ }
|
||||||
|
+ int (*func) (void) = dlsym (handle, sym);
|
||||||
|
+ if (func == NULL)
|
||||||
|
+ {
|
||||||
|
+ printf ("error: dlsym from auditor: %s\n", dlerror ());
|
||||||
|
+ fflush (stdout);
|
||||||
|
+ _exit (1);
|
||||||
|
+ }
|
||||||
|
+ /* Trigger TLS allocation. */
|
||||||
|
+ func ();
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ puts ("info: TLS-using modules loaded from auditor");
|
||||||
|
+ fflush (stdout);
|
||||||
|
+
|
||||||
|
+ return LAV_CURRENT;
|
||||||
|
+}
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
@ -0,0 +1,97 @@
|
|||||||
|
From daa15a5bffc436cf7b943b306c85c90ce8bb369e Mon Sep 17 00:00:00 2001
|
||||||
|
From: Feifei Wang <wangfeifei@hygon.cn>
|
||||||
|
Date: Mon, 19 Aug 2024 14:57:54 +0800
|
||||||
|
Subject: [PATCH 02/10] x86: Add cache information support for Hygon processors
|
||||||
|
|
||||||
|
Add hygon branch in dl_init_cacheinfo function to initialize
|
||||||
|
cache size variables for hygon processors. In the meanwhile,
|
||||||
|
add handle_hygon() function to get cache information.
|
||||||
|
|
||||||
|
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
|
||||||
|
Reviewed-by: Jing Li <lijing@hygon.cn>
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 60 ++++++++++++++++++++++++++++++++++++++
|
||||||
|
1 file changed, 60 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index 7b5ed210ca..85c404dd26 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -567,6 +567,48 @@ handle_zhaoxin (int name)
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
+static long int __attribute__ ((noinline))
|
||||||
|
+handle_hygon (int name)
|
||||||
|
+{
|
||||||
|
+ unsigned int eax;
|
||||||
|
+ unsigned int ebx;
|
||||||
|
+ unsigned int ecx;
|
||||||
|
+ unsigned int edx;
|
||||||
|
+ unsigned int count = 0x1;
|
||||||
|
+
|
||||||
|
+ if (name >= _SC_LEVEL3_CACHE_SIZE)
|
||||||
|
+ count = 0x3;
|
||||||
|
+ else if (name >= _SC_LEVEL2_CACHE_SIZE)
|
||||||
|
+ count = 0x2;
|
||||||
|
+ else if (name >= _SC_LEVEL1_DCACHE_SIZE)
|
||||||
|
+ count = 0x0;
|
||||||
|
+
|
||||||
|
+ /* Use __cpuid__ '0x8000_001D' to compute cache details. */
|
||||||
|
+ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
|
||||||
|
+
|
||||||
|
+ switch (name)
|
||||||
|
+ {
|
||||||
|
+ case _SC_LEVEL1_ICACHE_ASSOC:
|
||||||
|
+ case _SC_LEVEL1_DCACHE_ASSOC:
|
||||||
|
+ case _SC_LEVEL2_CACHE_ASSOC:
|
||||||
|
+ case _SC_LEVEL3_CACHE_ASSOC:
|
||||||
|
+ return ((ebx >> 22) & 0x3ff) + 1;
|
||||||
|
+ case _SC_LEVEL1_ICACHE_LINESIZE:
|
||||||
|
+ case _SC_LEVEL1_DCACHE_LINESIZE:
|
||||||
|
+ case _SC_LEVEL2_CACHE_LINESIZE:
|
||||||
|
+ case _SC_LEVEL3_CACHE_LINESIZE:
|
||||||
|
+ return (ebx & 0xfff) + 1;
|
||||||
|
+ case _SC_LEVEL1_ICACHE_SIZE:
|
||||||
|
+ case _SC_LEVEL1_DCACHE_SIZE:
|
||||||
|
+ case _SC_LEVEL2_CACHE_SIZE:
|
||||||
|
+ case _SC_LEVEL3_CACHE_SIZE:
|
||||||
|
+ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
|
||||||
|
+ default:
|
||||||
|
+ __builtin_unreachable ();
|
||||||
|
+ }
|
||||||
|
+ return -1;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
static void
|
||||||
|
get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
|
||||||
|
long int core)
|
||||||
|
@@ -890,6 +932,24 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
|
||||||
|
shared_per_thread = shared;
|
||||||
|
}
|
||||||
|
+ else if (cpu_features->basic.kind == arch_kind_hygon)
|
||||||
|
+ {
|
||||||
|
+ data = handle_hygon (_SC_LEVEL1_DCACHE_SIZE);
|
||||||
|
+ shared = handle_hygon (_SC_LEVEL3_CACHE_SIZE);
|
||||||
|
+ shared_per_thread = shared;
|
||||||
|
+
|
||||||
|
+ level1_icache_size = handle_hygon (_SC_LEVEL1_ICACHE_SIZE);
|
||||||
|
+ level1_icache_linesize = handle_hygon (_SC_LEVEL1_ICACHE_LINESIZE);
|
||||||
|
+ level1_dcache_size = data;
|
||||||
|
+ level1_dcache_assoc = handle_hygon (_SC_LEVEL1_DCACHE_ASSOC);
|
||||||
|
+ level1_dcache_linesize = handle_hygon (_SC_LEVEL1_DCACHE_LINESIZE);
|
||||||
|
+ level2_cache_size = handle_hygon (_SC_LEVEL2_CACHE_SIZE);;
|
||||||
|
+ level2_cache_assoc = handle_hygon (_SC_LEVEL2_CACHE_ASSOC);
|
||||||
|
+ level2_cache_linesize = handle_hygon (_SC_LEVEL2_CACHE_LINESIZE);
|
||||||
|
+ level3_cache_size = shared;
|
||||||
|
+ level3_cache_assoc = handle_hygon (_SC_LEVEL3_CACHE_ASSOC);
|
||||||
|
+ level3_cache_linesize = handle_hygon (_SC_LEVEL3_CACHE_LINESIZE);
|
||||||
|
+ }
|
||||||
|
|
||||||
|
cpu_features->level1_icache_size = level1_icache_size;
|
||||||
|
cpu_features->level1_icache_linesize = level1_icache_linesize;
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
@ -0,0 +1,69 @@
|
|||||||
|
From 3215d6157f5f94706aa5db6783838885a8a3c4f1 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Feifei Wang <wangfeifei@hygon.cn>
|
||||||
|
Date: Mon, 19 Aug 2024 14:57:53 +0800
|
||||||
|
Subject: [PATCH 01/10] x86: Add new architecture type for Hygon processors
|
||||||
|
|
||||||
|
Add a new architecture type arch_kind_hygon to split Hygon branch
|
||||||
|
from AMD. This is to facilitate the Hygon processors to make settings
|
||||||
|
that are suitable for its own characteristics.
|
||||||
|
|
||||||
|
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
|
||||||
|
Reviewed-by: Jing Li <lijing@hygon.cn>
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/cpu-features.c | 19 ++++++++++++++++---
|
||||||
|
sysdeps/x86/include/cpu-features.h | 1 +
|
||||||
|
2 files changed, 17 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||||
|
index f752ebd24d..c4dd85145e 100644
|
||||||
|
--- a/sysdeps/x86/cpu-features.c
|
||||||
|
+++ b/sysdeps/x86/cpu-features.c
|
||||||
|
@@ -851,9 +851,8 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
|
||||||
|
cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
|
||||||
|
|= bit_arch_Avoid_Short_Distance_REP_MOVSB;
|
||||||
|
}
|
||||||
|
- /* This spells out "AuthenticAMD" or "HygonGenuine". */
|
||||||
|
- else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
|
||||||
|
- || (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
|
||||||
|
+ /* This spells out "AuthenticAMD". */
|
||||||
|
+ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
|
||||||
|
{
|
||||||
|
unsigned int extended_model;
|
||||||
|
|
||||||
|
@@ -963,6 +962,20 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
+ /* This spells out "HygonGenuine". */
|
||||||
|
+ else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
|
||||||
|
+ {
|
||||||
|
+ unsigned int extended_model;
|
||||||
|
+
|
||||||
|
+ kind = arch_kind_hygon;
|
||||||
|
+
|
||||||
|
+ get_common_indices (cpu_features, &family, &model, &extended_model,
|
||||||
|
+ &stepping);
|
||||||
|
+
|
||||||
|
+ get_extended_indices (cpu_features);
|
||||||
|
+
|
||||||
|
+ update_active (cpu_features);
|
||||||
|
+ }
|
||||||
|
else
|
||||||
|
{
|
||||||
|
kind = arch_kind_other;
|
||||||
|
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
|
||||||
|
index eb30d342a6..594feeb2f4 100644
|
||||||
|
--- a/sysdeps/x86/include/cpu-features.h
|
||||||
|
+++ b/sysdeps/x86/include/cpu-features.h
|
||||||
|
@@ -856,6 +856,7 @@ enum cpu_features_kind
|
||||||
|
arch_kind_intel,
|
||||||
|
arch_kind_amd,
|
||||||
|
arch_kind_zhaoxin,
|
||||||
|
+ arch_kind_hygon,
|
||||||
|
arch_kind_other
|
||||||
|
};
|
||||||
|
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
211
backport-x86-Add-seperate-non-temporal-tunable-for-memset.patch
Normal file
211
backport-x86-Add-seperate-non-temporal-tunable-for-memset.patch
Normal file
@ -0,0 +1,211 @@
|
|||||||
|
From 4ad2c9d04b76d7c4a42d80a82c022cd60b43b8b2 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Fri, 24 May 2024 12:38:51 -0500
|
||||||
|
Subject: [PATCH 04/10] x86: Add seperate non-temporal tunable for memset
|
||||||
|
|
||||||
|
The tuning for non-temporal stores for memset vs memcpy is not always
|
||||||
|
the same. This includes both the exact value and whether non-temporal
|
||||||
|
stores are profitable at all for a given arch.
|
||||||
|
|
||||||
|
This patch adds `x86_memset_non_temporal_threshold`. Currently we
|
||||||
|
disable non-temporal stores for non Intel vendors as the only
|
||||||
|
benchmarks showing its benefit have been on Intel hardware.
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
manual/tunables.texi | 16 +++++++++++++++-
|
||||||
|
sysdeps/x86/cacheinfo.h | 8 +++++++-
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
|
||||||
|
sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
|
||||||
|
sysdeps/x86/dl-tunables.list | 3 +++
|
||||||
|
sysdeps/x86/include/cpu-features.h | 4 +++-
|
||||||
|
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
|
||||||
|
7 files changed, 49 insertions(+), 6 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/manual/tunables.texi b/manual/tunables.texi
|
||||||
|
index 6493904bae..2a2877884c 100644
|
||||||
|
--- a/manual/tunables.texi
|
||||||
|
+++ b/manual/tunables.texi
|
||||||
|
@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
|
||||||
|
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
|
||||||
|
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
|
||||||
|
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
|
||||||
|
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
|
||||||
|
glibc.cpu.x86_shstk:
|
||||||
|
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
|
||||||
|
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
|
||||||
|
@@ -486,7 +487,8 @@ thread stack originally backup by Huge Pages to default pages.
|
||||||
|
@cindex shared_cache_size tunables
|
||||||
|
@cindex tunables, shared_cache_size
|
||||||
|
@cindex non_temporal_threshold tunables
|
||||||
|
-@cindex tunables, non_temporal_threshold
|
||||||
|
+@cindex memset_non_temporal_threshold tunables
|
||||||
|
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
|
||||||
|
|
||||||
|
@deftp {Tunable namespace} glibc.cpu
|
||||||
|
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
|
||||||
|
@@ -562,6 +564,18 @@ like memmove and memcpy.
|
||||||
|
This tunable is specific to i386 and x86-64.
|
||||||
|
@end deftp
|
||||||
|
|
||||||
|
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
|
||||||
|
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
|
||||||
|
+the user to set threshold in bytes for non temporal store in
|
||||||
|
+memset. Non temporal stores give a hint to the hardware to move data
|
||||||
|
+directly to memory without displacing other data from the cache. This
|
||||||
|
+tunable is used by some platforms to determine when to use non
|
||||||
|
+temporal stores in memset.
|
||||||
|
+
|
||||||
|
+This tunable is specific to i386 and x86-64.
|
||||||
|
+@end deftp
|
||||||
|
+
|
||||||
|
+
|
||||||
|
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
|
||||||
|
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
|
||||||
|
set threshold in bytes to start using "rep movsb". The value must be
|
||||||
|
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
|
||||||
|
index ec1bc142c4..fd2b2ae66b 100644
|
||||||
|
--- a/sysdeps/x86/cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/cacheinfo.h
|
||||||
|
@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
|
||||||
|
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
|
||||||
|
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
|
||||||
|
|
||||||
|
-/* Threshold to use non temporal store. */
|
||||||
|
+/* Threshold to use non temporal store in memmove. */
|
||||||
|
long int __x86_shared_non_temporal_threshold attribute_hidden;
|
||||||
|
|
||||||
|
+/* Threshold to use non temporal store in memset. */
|
||||||
|
+long int __x86_memset_non_temporal_threshold attribute_hidden;
|
||||||
|
+
|
||||||
|
/* Threshold to use Enhanced REP MOVSB. */
|
||||||
|
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
|
||||||
|
|
||||||
|
@@ -77,6 +80,9 @@ init_cacheinfo (void)
|
||||||
|
__x86_shared_non_temporal_threshold
|
||||||
|
= cpu_features->non_temporal_threshold;
|
||||||
|
|
||||||
|
+ __x86_memset_non_temporal_threshold
|
||||||
|
+ = cpu_features->memset_non_temporal_threshold;
|
||||||
|
+
|
||||||
|
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
|
||||||
|
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
|
||||||
|
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index ce2e6927e4..9f27da21ce 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -1048,6 +1048,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||||
|
rep_movsb_threshold = 2112;
|
||||||
|
|
||||||
|
+ /* Non-temporal stores in memset have only been tested on Intel hardware.
|
||||||
|
+ Until we benchmark data on other x86 processor, disable non-temporal
|
||||||
|
+ stores in memset. */
|
||||||
|
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||||
|
+ if (cpu_features->basic.kind == arch_kind_intel)
|
||||||
|
+ memset_non_temporal_threshold = non_temporal_threshold;
|
||||||
|
+
|
||||||
|
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||||
|
cases slower than the vectorized path (and for some alignments,
|
||||||
|
it is really slow, check BZ #30994). */
|
||||||
|
@@ -1074,6 +1081,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
&& tunable_size <= maximum_non_temporal_threshold)
|
||||||
|
non_temporal_threshold = tunable_size;
|
||||||
|
|
||||||
|
+ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
|
||||||
|
+ if (tunable_size > minimum_non_temporal_threshold
|
||||||
|
+ && tunable_size <= maximum_non_temporal_threshold)
|
||||||
|
+ memset_non_temporal_threshold = tunable_size;
|
||||||
|
+
|
||||||
|
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
|
||||||
|
if (tunable_size > minimum_rep_movsb_threshold)
|
||||||
|
rep_movsb_threshold = tunable_size;
|
||||||
|
@@ -1089,6 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
|
||||||
|
minimum_non_temporal_threshold,
|
||||||
|
maximum_non_temporal_threshold);
|
||||||
|
+ TUNABLE_SET_WITH_BOUNDS (
|
||||||
|
+ x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
|
||||||
|
+ minimum_non_temporal_threshold, maximum_non_temporal_threshold);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
|
||||||
|
minimum_rep_movsb_threshold, SIZE_MAX);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
|
||||||
|
@@ -1102,6 +1117,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
cpu_features->data_cache_size = data;
|
||||||
|
cpu_features->shared_cache_size = shared;
|
||||||
|
cpu_features->non_temporal_threshold = non_temporal_threshold;
|
||||||
|
+ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
|
||||||
|
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
|
||||||
|
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
|
||||||
|
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
|
||||||
|
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
|
||||||
|
index 5aab63e532..05d54b5eba 100644
|
||||||
|
--- a/sysdeps/x86/dl-diagnostics-cpu.c
|
||||||
|
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
|
||||||
|
@@ -83,6 +83,8 @@ _dl_diagnostics_cpu (void)
|
||||||
|
cpu_features->shared_cache_size);
|
||||||
|
print_cpu_features_value ("non_temporal_threshold",
|
||||||
|
cpu_features->non_temporal_threshold);
|
||||||
|
+ print_cpu_features_value ("memset_non_temporal_threshold",
|
||||||
|
+ cpu_features->memset_non_temporal_threshold);
|
||||||
|
print_cpu_features_value ("rep_movsb_threshold",
|
||||||
|
cpu_features->rep_movsb_threshold);
|
||||||
|
print_cpu_features_value ("rep_movsb_stop_threshold",
|
||||||
|
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
|
||||||
|
index d1442d88ba..53852d6a07 100644
|
||||||
|
--- a/sysdeps/x86/dl-tunables.list
|
||||||
|
+++ b/sysdeps/x86/dl-tunables.list
|
||||||
|
@@ -30,6 +30,9 @@ glibc {
|
||||||
|
x86_non_temporal_threshold {
|
||||||
|
type: SIZE_T
|
||||||
|
}
|
||||||
|
+ x86_memset_non_temporal_threshold {
|
||||||
|
+ type: SIZE_T
|
||||||
|
+ }
|
||||||
|
x86_rep_movsb_threshold {
|
||||||
|
type: SIZE_T
|
||||||
|
# Since there is overhead to set up REP MOVSB operation, REP
|
||||||
|
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
|
||||||
|
index 594feeb2f4..e2d641dcd0 100644
|
||||||
|
--- a/sysdeps/x86/include/cpu-features.h
|
||||||
|
+++ b/sysdeps/x86/include/cpu-features.h
|
||||||
|
@@ -918,8 +918,10 @@ struct cpu_features
|
||||||
|
/* Shared cache size for use in memory and string routines, typically
|
||||||
|
L2 or L3 size. */
|
||||||
|
unsigned long int shared_cache_size;
|
||||||
|
- /* Threshold to use non temporal store. */
|
||||||
|
+ /* Threshold to use non temporal store in memmove. */
|
||||||
|
unsigned long int non_temporal_threshold;
|
||||||
|
+ /* Threshold to use non temporal store in memset. */
|
||||||
|
+ unsigned long int memset_non_temporal_threshold;
|
||||||
|
/* Threshold to use "rep movsb". */
|
||||||
|
unsigned long int rep_movsb_threshold;
|
||||||
|
/* Threshold to stop using "rep movsb". */
|
||||||
|
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
index aba45e3da0..d95750b516 100644
|
||||||
|
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
@@ -24,9 +24,9 @@
|
||||||
|
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
|
||||||
|
4 VEC stores and store 4 * VEC at a time until done.
|
||||||
|
6. On machines ERMS feature, if size is range
|
||||||
|
- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
|
||||||
|
+ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
|
||||||
|
then REP STOSB will be used.
|
||||||
|
- 7. If size >= __x86_shared_non_temporal_threshold, use a
|
||||||
|
+ 7. If size >= __x86_memset_non_temporal_threshold, use a
|
||||||
|
non-temporal stores. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
@@ -318,7 +318,7 @@ L(return_vzeroupper):
|
||||||
|
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
|
||||||
|
range for 2-byte jump encoding. */
|
||||||
|
L(stosb_local):
|
||||||
|
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||||
|
+ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
|
||||||
|
jae L(nt_memset)
|
||||||
|
movzbl %sil, %eax
|
||||||
|
mov %RDX_LP, %RCX_LP
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
263
backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
Normal file
263
backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
Normal file
@ -0,0 +1,263 @@
|
|||||||
|
From ce7c6c491ed0750a10f9a52b5edc710d978e70e2 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Mon, 15 Jul 2024 16:19:17 +0800
|
||||||
|
Subject: [PATCH 07/10] x86: Disable non-temporal memset on Skylake Server
|
||||||
|
|
||||||
|
The original commit enabling non-temporal memset on Skylake Server had
|
||||||
|
erroneous benchmarks (actually done on ICX).
|
||||||
|
|
||||||
|
Further benchmarks indicate non-temporal stores may in fact be a
|
||||||
|
regression on Skylake Server.
|
||||||
|
|
||||||
|
This commit may be over-cautious in some cases, but should avoid any
|
||||||
|
regressions for 2.40.
|
||||||
|
|
||||||
|
Tested using qemu on all x86_64 cpu arch supported by both qemu +
|
||||||
|
GLIBC.
|
||||||
|
|
||||||
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/cpu-features.c | 13 +-
|
||||||
|
sysdeps/x86/cpu-tunables.c | 6 +
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 15 +-
|
||||||
|
...cpu-features-preferred_feature_index_1.def | 1 +
|
||||||
|
sysdeps/x86/tst-hwcap-tunables.c | 148 ++++++++++++++++++
|
||||||
|
5 files changed, 173 insertions(+), 10 deletions(-)
|
||||||
|
create mode 100644 sysdeps/x86/tst-hwcap-tunables.c
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||||
|
index c4dd85145e..b4030776a7 100644
|
||||||
|
--- a/sysdeps/x86/cpu-features.c
|
||||||
|
+++ b/sysdeps/x86/cpu-features.c
|
||||||
|
@@ -754,11 +754,18 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||||
|
|
||||||
|
/* Newer Bigcore microarch (larger non-temporal store
|
||||||
|
threshold). */
|
||||||
|
- case INTEL_BIGCORE_SKYLAKE:
|
||||||
|
- case INTEL_BIGCORE_KABYLAKE:
|
||||||
|
- case INTEL_BIGCORE_COMETLAKE:
|
||||||
|
case INTEL_BIGCORE_SKYLAKE_AVX512:
|
||||||
|
case INTEL_BIGCORE_CANNONLAKE:
|
||||||
|
+ /* Benchmarks indicate non-temporal memset is not
|
||||||
|
+ necessarily profitable on SKX (and in some cases much
|
||||||
|
+ worse). This is likely unique to SKX due to its unique
|
||||||
|
+ mesh interconnect (not present on ICX or BWD). Disable
|
||||||
|
+ non-temporal on all Skylake servers. */
|
||||||
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||||
|
+ |= bit_arch_Avoid_Non_Temporal_Memset;
|
||||||
|
+ case INTEL_BIGCORE_COMETLAKE:
|
||||||
|
+ case INTEL_BIGCORE_SKYLAKE:
|
||||||
|
+ case INTEL_BIGCORE_KABYLAKE:
|
||||||
|
case INTEL_BIGCORE_ICELAKE:
|
||||||
|
case INTEL_BIGCORE_TIGERLAKE:
|
||||||
|
case INTEL_BIGCORE_ROCKETLAKE:
|
||||||
|
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
||||||
|
index 0d4f328585..b8475730ea 100644
|
||||||
|
--- a/sysdeps/x86/cpu-tunables.c
|
||||||
|
+++ b/sysdeps/x86/cpu-tunables.c
|
||||||
|
@@ -272,6 +272,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||||
|
disable, 24);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
+ case 25:
|
||||||
|
+ {
|
||||||
|
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
|
||||||
|
+ Avoid_Non_Temporal_Memset,
|
||||||
|
+ disable, 25);
|
||||||
|
+ }
|
||||||
|
case 26:
|
||||||
|
{
|
||||||
|
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index a76df092e6..de4584116f 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -1051,13 +1051,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
/* Non-temporal stores are more performant on Intel and AMD hardware above
|
||||||
|
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
||||||
|
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||||
|
- if (cpu_features->basic.kind == arch_kind_intel
|
||||||
|
- || cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
- memset_non_temporal_threshold = non_temporal_threshold;
|
||||||
|
-
|
||||||
|
- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||||
|
- cases slower than the vectorized path (and for some alignments,
|
||||||
|
- it is really slow, check BZ #30994). */
|
||||||
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
|
||||||
|
+ && (cpu_features->basic.kind == arch_kind_intel
|
||||||
|
+ || cpu_features->basic.kind == arch_kind_amd))
|
||||||
|
+ memset_non_temporal_threshold = non_temporal_threshold;
|
||||||
|
+
|
||||||
|
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||||
|
+ cases slower than the vectorized path (and for some alignments,
|
||||||
|
+ it is really slow, check BZ #30994). */
|
||||||
|
if (cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
rep_movsb_threshold = non_temporal_threshold;
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||||
|
index d20c5b3196..aae1c85551 100644
|
||||||
|
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||||
|
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||||
|
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
|
||||||
|
BIT (MathVec_Prefer_No_AVX512)
|
||||||
|
BIT (Prefer_FSRM)
|
||||||
|
BIT (Avoid_Short_Distance_REP_MOVSB)
|
||||||
|
+BIT (Avoid_Non_Temporal_Memset)
|
||||||
|
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..94307283d7
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/x86/tst-hwcap-tunables.c
|
||||||
|
@@ -0,0 +1,148 @@
|
||||||
|
+/* Tests for x86 GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
|
||||||
|
+ Copyright (C) 2023-2024 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <http://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <array_length.h>
|
||||||
|
+#include <getopt.h>
|
||||||
|
+#include <ifunc-impl-list.h>
|
||||||
|
+#include <spawn.h>
|
||||||
|
+#include <stdio.h>
|
||||||
|
+#include <stdlib.h>
|
||||||
|
+#include <string.h>
|
||||||
|
+#include <intprops.h>
|
||||||
|
+#include <support/check.h>
|
||||||
|
+#include <support/support.h>
|
||||||
|
+#include <support/xunistd.h>
|
||||||
|
+#include <support/capture_subprocess.h>
|
||||||
|
+
|
||||||
|
+/* Nonzero if the program gets called via `exec'. */
|
||||||
|
+#define CMDLINE_OPTIONS \
|
||||||
|
+ { "restart", no_argument, &restart, 1 },
|
||||||
|
+static int restart;
|
||||||
|
+
|
||||||
|
+/* Disable everything. */
|
||||||
|
+static const char *test_1[] =
|
||||||
|
+{
|
||||||
|
+ "__memcpy_avx512_no_vzeroupper",
|
||||||
|
+ "__memcpy_avx512_unaligned",
|
||||||
|
+ "__memcpy_avx512_unaligned_erms",
|
||||||
|
+ "__memcpy_evex_unaligned",
|
||||||
|
+ "__memcpy_evex_unaligned_erms",
|
||||||
|
+ "__memcpy_avx_unaligned",
|
||||||
|
+ "__memcpy_avx_unaligned_erms",
|
||||||
|
+ "__memcpy_avx_unaligned_rtm",
|
||||||
|
+ "__memcpy_avx_unaligned_erms_rtm",
|
||||||
|
+ "__memcpy_ssse3",
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
+static const struct test_t
|
||||||
|
+{
|
||||||
|
+ const char *env;
|
||||||
|
+ const char *const *funcs;
|
||||||
|
+ size_t nfuncs;
|
||||||
|
+} tests[] =
|
||||||
|
+{
|
||||||
|
+ {
|
||||||
|
+ /* Disable everything. */
|
||||||
|
+ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
|
||||||
|
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
|
||||||
|
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
|
||||||
|
+ test_1,
|
||||||
|
+ array_length (test_1)
|
||||||
|
+ },
|
||||||
|
+ {
|
||||||
|
+ /* Same as before, but with some empty suboptions. */
|
||||||
|
+ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
|
||||||
|
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
|
||||||
|
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
|
||||||
|
+ test_1,
|
||||||
|
+ array_length (test_1)
|
||||||
|
+ }
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
+/* Called on process re-execution. */
|
||||||
|
+_Noreturn static void
|
||||||
|
+handle_restart (int ntest)
|
||||||
|
+{
|
||||||
|
+ struct libc_ifunc_impl impls[32];
|
||||||
|
+ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
|
||||||
|
+ if (cnt == 0)
|
||||||
|
+ _exit (EXIT_SUCCESS);
|
||||||
|
+ TEST_VERIFY_EXIT (cnt >= 1);
|
||||||
|
+ for (int i = 0; i < cnt; i++)
|
||||||
|
+ {
|
||||||
|
+ for (int f = 0; f < tests[ntest].nfuncs; f++)
|
||||||
|
+ {
|
||||||
|
+ if (strcmp (impls[i].name, tests[ntest].funcs[f]) == 0)
|
||||||
|
+ TEST_COMPARE (impls[i].usable, false);
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ _exit (EXIT_SUCCESS);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static int
|
||||||
|
+do_test (int argc, char *argv[])
|
||||||
|
+{
|
||||||
|
+ /* We must have either:
|
||||||
|
+ - One or four parameters left if called initially:
|
||||||
|
+ + path to ld.so optional
|
||||||
|
+ + "--library-path" optional
|
||||||
|
+ + the library path optional
|
||||||
|
+ + the application name
|
||||||
|
+ + the test to check */
|
||||||
|
+
|
||||||
|
+ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
|
||||||
|
+
|
||||||
|
+ if (restart)
|
||||||
|
+ handle_restart (atoi (argv[1]));
|
||||||
|
+
|
||||||
|
+ char nteststr[INT_BUFSIZE_BOUND (int)];
|
||||||
|
+
|
||||||
|
+ char *spargv[10];
|
||||||
|
+ {
|
||||||
|
+ int i = 0;
|
||||||
|
+ for (; i < argc - 1; i++)
|
||||||
|
+ spargv[i] = argv[i + 1];
|
||||||
|
+ spargv[i++] = (char *) "--direct";
|
||||||
|
+ spargv[i++] = (char *) "--restart";
|
||||||
|
+ spargv[i++] = nteststr;
|
||||||
|
+ spargv[i] = NULL;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ for (int i = 0; i < array_length (tests); i++)
|
||||||
|
+ {
|
||||||
|
+ snprintf (nteststr, sizeof nteststr, "%d", i);
|
||||||
|
+
|
||||||
|
+ printf ("[%d] Spawned test for %s\n", i, tests[i].env);
|
||||||
|
+ char *tunable = xasprintf ("glibc.cpu.hwcaps=%s", tests[i].env);
|
||||||
|
+ setenv ("GLIBC_TUNABLES", tunable, 1);
|
||||||
|
+
|
||||||
|
+ struct support_capture_subprocess result
|
||||||
|
+ = support_capture_subprogram (spargv[0], spargv, NULL);
|
||||||
|
+ support_capture_subprocess_check (&result, "tst-tunables", 0,
|
||||||
|
+ sc_allow_stderr);
|
||||||
|
+ support_capture_subprocess_free (&result);
|
||||||
|
+
|
||||||
|
+ free (tunable);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#define TEST_FUNCTION_ARGV do_test
|
||||||
|
+#include <support/test-driver.c>
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
@ -0,0 +1,92 @@
|
|||||||
|
From 1e57e1c6aa6ca5a476aba725271c1ace9be345d3 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Feifei Wang <wangfeifei@hygon.cn>
|
||||||
|
Date: Mon, 19 Aug 2024 14:57:55 +0800
|
||||||
|
Subject: [PATCH 10/10] x86: Enable non-temporal memset for Hygon processors
|
||||||
|
|
||||||
|
This patch uses 'Avoid_Non_Temporal_Memset' flag to access
|
||||||
|
the non-temporal memset implementation for hygon processors.
|
||||||
|
|
||||||
|
Test Results:
|
||||||
|
|
||||||
|
hygon1 arch
|
||||||
|
x86_memset_non_temporal_threshold = 8MB
|
||||||
|
size new performance time / old performance time
|
||||||
|
1MB 0.994
|
||||||
|
4MB 0.996
|
||||||
|
8MB 0.670
|
||||||
|
16MB 0.343
|
||||||
|
32MB 0.355
|
||||||
|
|
||||||
|
hygon2 arch
|
||||||
|
x86_memset_non_temporal_threshold = 8MB
|
||||||
|
size new performance time / old performance time
|
||||||
|
1MB 1
|
||||||
|
4MB 1
|
||||||
|
8MB 1.312
|
||||||
|
16MB 0.822
|
||||||
|
32MB 0.830
|
||||||
|
|
||||||
|
hygon3 arch
|
||||||
|
x86_memset_non_temporal_threshold = 8MB
|
||||||
|
size new performance time / old performance time
|
||||||
|
1MB 1
|
||||||
|
4MB 0.990
|
||||||
|
8MB 0.737
|
||||||
|
16MB 0.390
|
||||||
|
32MB 0.401
|
||||||
|
|
||||||
|
For hygon arch with this patch, non-temporal stores can improve
|
||||||
|
performance by 20% - 65%.
|
||||||
|
|
||||||
|
Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
|
||||||
|
Reviewed-by: Jing Li <lijing@hygon.cn>
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/cpu-features.c | 9 +++++++--
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 2 +-
|
||||||
|
2 files changed, 8 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||||
|
index 287edc5b08..f5539aea6f 100644
|
||||||
|
--- a/sysdeps/x86/cpu-features.c
|
||||||
|
+++ b/sysdeps/x86/cpu-features.c
|
||||||
|
@@ -640,9 +640,9 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||||
|
unsigned int stepping = 0;
|
||||||
|
enum cpu_features_kind kind;
|
||||||
|
|
||||||
|
- /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
|
||||||
|
+ /* Default is avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is,
|
||||||
|
as of writing this, we only have benchmarks indicatings it profitability
|
||||||
|
- on Intel/AMD. */
|
||||||
|
+ on Intel/AMD/Hygon. */
|
||||||
|
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||||
|
|= bit_arch_Avoid_Non_Temporal_Memset;
|
||||||
|
|
||||||
|
@@ -998,6 +998,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
|
||||||
|
get_extended_indices (cpu_features);
|
||||||
|
|
||||||
|
update_active (cpu_features);
|
||||||
|
+
|
||||||
|
+ /* Benchmarks indicate non-temporal memset can be profitable on Hygon
|
||||||
|
+ hardware. */
|
||||||
|
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||||
|
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index 5803bfcea8..d4dad8df3b 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
|
||||||
|
/* Non-temporal stores are more performant on some hardware above
|
||||||
|
non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
|
||||||
|
- Intel and AMD hardware. */
|
||||||
|
+ Intel, AMD and Hygon hardware. */
|
||||||
|
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||||
|
if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
|
||||||
|
memset_non_temporal_threshold = non_temporal_threshold;
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
@ -0,0 +1,47 @@
|
|||||||
|
From 54e99a96ec3b97f53ee018bfa8dbbef2dd13f1e8 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Joe Damato <jdamato@fastly.com>
|
||||||
|
Date: Fri, 7 Jun 2024 23:04:47 +0000
|
||||||
|
Subject: [PATCH 05/10] x86: Enable non-temporal memset tunable for AMD
|
||||||
|
|
||||||
|
In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for
|
||||||
|
memset") a tunable threshold for enabling non-temporal memset was added,
|
||||||
|
but only for Intel hardware.
|
||||||
|
|
||||||
|
Since that commit, new benchmark results suggest that non-temporal
|
||||||
|
memset is beneficial on AMD, as well, so allow this tunable to be set
|
||||||
|
for AMD.
|
||||||
|
|
||||||
|
See:
|
||||||
|
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
|
||||||
|
which has been updated to include data using different strategies for
|
||||||
|
large memset on AMD Zen2, Zen3, and Zen4.
|
||||||
|
|
||||||
|
Signed-off-by: Joe Damato <jdamato@fastly.com>
|
||||||
|
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 8 ++++----
|
||||||
|
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index 9f27da21ce..dfdb4069c7 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -1048,11 +1048,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||||
|
rep_movsb_threshold = 2112;
|
||||||
|
|
||||||
|
- /* Non-temporal stores in memset have only been tested on Intel hardware.
|
||||||
|
- Until we benchmark data on other x86 processor, disable non-temporal
|
||||||
|
- stores in memset. */
|
||||||
|
+ /* Non-temporal stores are more performant on Intel and AMD hardware above
|
||||||
|
+ non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
||||||
|
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||||
|
- if (cpu_features->basic.kind == arch_kind_intel)
|
||||||
|
+ if (cpu_features->basic.kind == arch_kind_intel
|
||||||
|
+ || cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
memset_non_temporal_threshold = non_temporal_threshold;
|
||||||
|
|
||||||
|
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
149
backport-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
Normal file
149
backport-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
From f1ea6401d790764e4fcf02c6fb28e69841c25640 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
Date: Thu, 8 Feb 2024 10:08:38 -0300
|
||||||
|
Subject: [PATCH 03/10] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
|
||||||
|
|
||||||
|
The REP MOVSB usage on memcpy/memmove does not show much performance
|
||||||
|
improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
|
||||||
|
as from BZ 30994, if the source is aligned and the destination is not
|
||||||
|
the performance can be 20x slower.
|
||||||
|
|
||||||
|
The performance difference is noticeable with small buffer sizes, closer
|
||||||
|
to the lower bounds limits when memcpy/memmove starts to use ERMS. The
|
||||||
|
performance of REP MOVSB is similar to vectorized instruction on the
|
||||||
|
size limit (the L2 cache). Also, there is no drawback to multiple cores
|
||||||
|
sharing the cache.
|
||||||
|
|
||||||
|
Checked on x86_64-linux-gnu on Zen3.
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
|
||||||
|
1 file changed, 18 insertions(+), 20 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index 85c404dd26..ce2e6927e4 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -833,7 +833,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
long int data = -1;
|
||||||
|
long int shared = -1;
|
||||||
|
long int shared_per_thread = -1;
|
||||||
|
- long int core = -1;
|
||||||
|
unsigned int threads = 0;
|
||||||
|
unsigned long int level1_icache_size = -1;
|
||||||
|
unsigned long int level1_icache_linesize = -1;
|
||||||
|
@@ -851,7 +850,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (cpu_features->basic.kind == arch_kind_intel)
|
||||||
|
{
|
||||||
|
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
|
||||||
|
- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
|
||||||
|
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
|
||||||
|
shared_per_thread = shared;
|
||||||
|
|
||||||
|
@@ -864,7 +862,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
|
||||||
|
level1_dcache_linesize
|
||||||
|
= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
|
||||||
|
- level2_cache_size = core;
|
||||||
|
+ level2_cache_size
|
||||||
|
+ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
|
||||||
|
level2_cache_assoc
|
||||||
|
= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
|
||||||
|
level2_cache_linesize
|
||||||
|
@@ -877,12 +876,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
level4_cache_size
|
||||||
|
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
|
||||||
|
|
||||||
|
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
|
||||||
|
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
|
||||||
|
+ level2_cache_size);
|
||||||
|
}
|
||||||
|
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
|
||||||
|
{
|
||||||
|
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
|
||||||
|
- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
|
||||||
|
shared_per_thread = shared;
|
||||||
|
|
||||||
|
@@ -891,19 +890,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
level1_dcache_size = data;
|
||||||
|
level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
|
||||||
|
level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
|
||||||
|
- level2_cache_size = core;
|
||||||
|
+ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
|
||||||
|
level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
|
||||||
|
level3_cache_size = shared;
|
||||||
|
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
|
||||||
|
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
|
||||||
|
|
||||||
|
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
|
||||||
|
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
|
||||||
|
+ level2_cache_size);
|
||||||
|
}
|
||||||
|
else if (cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
{
|
||||||
|
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
|
||||||
|
- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
|
||||||
|
|
||||||
|
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
|
||||||
|
@@ -911,7 +910,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
level1_dcache_size = data;
|
||||||
|
level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
|
||||||
|
level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
|
||||||
|
- level2_cache_size = core;
|
||||||
|
+ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);;
|
||||||
|
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
|
||||||
|
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
|
||||||
|
level3_cache_size = shared;
|
||||||
|
@@ -922,12 +921,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (shared <= 0)
|
||||||
|
{
|
||||||
|
/* No shared L3 cache. All we have is the L2 cache. */
|
||||||
|
- shared = core;
|
||||||
|
+ shared = level2_cache_size;
|
||||||
|
}
|
||||||
|
else if (cpu_features->basic.family < 0x17)
|
||||||
|
{
|
||||||
|
/* Account for exclusive L2 and L3 caches. */
|
||||||
|
- shared += core;
|
||||||
|
+ shared += level2_cache_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_per_thread = shared;
|
||||||
|
@@ -1049,6 +1048,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||||
|
rep_movsb_threshold = 2112;
|
||||||
|
|
||||||
|
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||||
|
+ cases slower than the vectorized path (and for some alignments,
|
||||||
|
+ it is really slow, check BZ #30994). */
|
||||||
|
+ if (cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
+ rep_movsb_threshold = non_temporal_threshold;
|
||||||
|
+
|
||||||
|
/* The default threshold to use Enhanced REP STOSB. */
|
||||||
|
unsigned long int rep_stosb_threshold = 2048;
|
||||||
|
|
||||||
|
@@ -1090,16 +1095,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
SIZE_MAX);
|
||||||
|
|
||||||
|
unsigned long int rep_movsb_stop_threshold;
|
||||||
|
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
|
||||||
|
- performing poorly for data above L2 cache size. Henceforth, adding
|
||||||
|
- an upper bound threshold parameter to limit the usage of Enhanced
|
||||||
|
- REP MOVSB operations and setting its value to L2 cache size. */
|
||||||
|
- if (cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
- rep_movsb_stop_threshold = core;
|
||||||
|
/* Setting the upper bound of ERMS to the computed value of
|
||||||
|
- non-temporal threshold for architectures other than AMD. */
|
||||||
|
- else
|
||||||
|
- rep_movsb_stop_threshold = non_temporal_threshold;
|
||||||
|
+ non-temporal threshold for all architectures. */
|
||||||
|
+ rep_movsb_stop_threshold = non_temporal_threshold;
|
||||||
|
|
||||||
|
cpu_features->data_cache_size = data;
|
||||||
|
cpu_features->shared_cache_size = shared;
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
@ -0,0 +1,41 @@
|
|||||||
|
From 8a2cea0ae0cbd4120770b81f0be422f60f020e17 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Fri, 14 Jun 2024 13:01:58 -0500
|
||||||
|
Subject: [PATCH 06/10] x86: Fix value for `x86_memset_non_temporal_threshold`
|
||||||
|
when it is undesirable
|
||||||
|
|
||||||
|
When we don't want to use non-temporal stores for memset, we set
|
||||||
|
`x86_memset_non_temporal_threshold` to SIZE_MAX.
|
||||||
|
|
||||||
|
The current code, however, was using `maximum_non_temporal_threshold`
|
||||||
|
as the upper bound which is `SIZE_MAX >> 4` so we ended up with a
|
||||||
|
value of `0`.
|
||||||
|
|
||||||
|
The fix is to just use `SIZE_MAX` as the upper bound when setting the
|
||||||
|
tunable.
|
||||||
|
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 6 +++---
|
||||||
|
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index dfdb4069c7..a76df092e6 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -1101,9 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
|
||||||
|
minimum_non_temporal_threshold,
|
||||||
|
maximum_non_temporal_threshold);
|
||||||
|
- TUNABLE_SET_WITH_BOUNDS (
|
||||||
|
- x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
|
||||||
|
- minimum_non_temporal_threshold, maximum_non_temporal_threshold);
|
||||||
|
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
|
||||||
|
+ memset_non_temporal_threshold,
|
||||||
|
+ minimum_non_temporal_threshold, SIZE_MAX);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
|
||||||
|
minimum_rep_movsb_threshold, SIZE_MAX);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
|
||||||
|
--
|
||||||
|
2.17.1
|
||||||
|
|
||||||
66
fix-CVE-2019-1010023.patch
Normal file
66
fix-CVE-2019-1010023.patch
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
From fe1ffef2eec9c6634a1e9af951eb68f0f5614470 Mon Sep 17 00:00:00 2001
|
||||||
|
From: xujing <xujing99@huawei.com>
|
||||||
|
Date: Thu, 2 Dec 2021 11:41:46 +0800
|
||||||
|
Subject: [PATCH] glibc: fix CVE-2019-1010023
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
| PT_LOAD
|
||||||
|
|
|
||||||
|
| […] Loadable segment entries in the program header table appear in
|
||||||
|
| ascending order, sorted on the p_vaddr member.
|
||||||
|
|
||||||
|
http://www.sco.com/developers/gabi/latest/ch5.pheader.html
|
||||||
|
|
||||||
|
Some checks are needed to fix the vulnerability in load command mapping reported by
|
||||||
|
|
||||||
|
https://sourceware.org/bugzilla/show_bug.cgi?id=22851
|
||||||
|
|
||||||
|
Signed-off-by: lvying <lvying6@huawei.com>
|
||||||
|
Signed-off-by: xujing <xujing99@huawei.com>
|
||||||
|
---
|
||||||
|
elf/dl-map-segments.h | 9 +++++++++
|
||||||
|
1 file changed, 9 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h
|
||||||
|
index 084076a2..a41ae73b 100644
|
||||||
|
--- a/elf/dl-map-segments.h
|
||||||
|
+++ b/elf/dl-map-segments.h
|
||||||
|
@@ -33,6 +33,7 @@ _dl_map_segments (struct link_map *l, int fd,
|
||||||
|
struct link_map *loader)
|
||||||
|
{
|
||||||
|
const struct loadcmd *c = loadcmds;
|
||||||
|
+ ElfW(Addr) l_map_end_aligned;
|
||||||
|
|
||||||
|
if (__glibc_likely (type == ET_DYN))
|
||||||
|
{
|
||||||
|
@@ -61,6 +62,8 @@ _dl_map_segments (struct link_map *l, int fd,
|
||||||
|
return DL_MAP_SEGMENTS_ERROR_MAP_SEGMENT;
|
||||||
|
|
||||||
|
l->l_map_end = l->l_map_start + maplength;
|
||||||
|
+ l_map_end_aligned = ((l->l_map_end + GLRO(dl_pagesize) - 1)
|
||||||
|
+ & ~(GLRO(dl_pagesize) - 1));
|
||||||
|
l->l_addr = l->l_map_start - c->mapstart;
|
||||||
|
|
||||||
|
if (has_holes)
|
||||||
|
@@ -85,10 +88,16 @@ _dl_map_segments (struct link_map *l, int fd,
|
||||||
|
/* Remember which part of the address space this object uses. */
|
||||||
|
l->l_map_start = c->mapstart + l->l_addr;
|
||||||
|
l->l_map_end = l->l_map_start + maplength;
|
||||||
|
+ l_map_end_aligned = ((l->l_map_end + GLRO(dl_pagesize) - 1)
|
||||||
|
+ & ~(GLRO(dl_pagesize) - 1));
|
||||||
|
l->l_contiguous = !has_holes;
|
||||||
|
|
||||||
|
while (c < &loadcmds[nloadcmds])
|
||||||
|
{
|
||||||
|
+ if ((l->l_addr + c->mapend) > l_map_end_aligned ||
|
||||||
|
+ (l->l_addr + c->mapstart) < l->l_map_start)
|
||||||
|
+ return DL_MAP_SEGMENTS_ERROR_MAP_SEGMENT;
|
||||||
|
+
|
||||||
|
if (c->mapend > c->mapstart
|
||||||
|
/* Map the segment contents from the file. */
|
||||||
|
&& (__mmap ((void *) (l->l_addr + c->mapstart),
|
||||||
|
--
|
||||||
|
2.23.0
|
||||||
|
|
||||||
59
glibc.spec
59
glibc.spec
@ -67,7 +67,7 @@
|
|||||||
##############################################################################
|
##############################################################################
|
||||||
Name: glibc
|
Name: glibc
|
||||||
Version: 2.38
|
Version: 2.38
|
||||||
Release: 52
|
Release: 57
|
||||||
Summary: The GNU libc libraries
|
Summary: The GNU libc libraries
|
||||||
License: %{all_license}
|
License: %{all_license}
|
||||||
URL: http://www.gnu.org/software/glibc/
|
URL: http://www.gnu.org/software/glibc/
|
||||||
@ -267,6 +267,25 @@ Patch177: elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
|
|||||||
Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
|
Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
|
||||||
Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
|
Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
|
||||||
Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
|
Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
|
||||||
|
Patch181: backport-elf-Keep-using-minimal-malloc-after-early-DTV-resize.patch
|
||||||
|
Patch182: backport-x86-Add-new-architecture-type-for-Hygon-processors.patch
|
||||||
|
Patch183: backport-x86-Add-cache-information-support-for-Hygon-processo.patch
|
||||||
|
Patch184: backport-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
|
||||||
|
Patch185: backport-x86-Add-seperate-non-temporal-tunable-for-memset.patch
|
||||||
|
Patch186: backport-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
|
||||||
|
Patch187: backport-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
|
||||||
|
Patch188: backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
|
||||||
|
Patch189: backport-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
|
||||||
|
Patch190: backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
|
||||||
|
Patch191: backport-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
|
||||||
|
Patch192: assert-Add-test-for-CVE-2025-0395.patch
|
||||||
|
Patch193: AArch64-Improve-generic-strlen.patch
|
||||||
|
Patch194: AArch64-Optimize-memset.patch
|
||||||
|
Patch195: AArch64-Remove-zva_128-from-memset.patch
|
||||||
|
Patch196: math-Improve-layout-of-expf-data.patch
|
||||||
|
Patch197: AArch64-Add-SVE-memset.patch
|
||||||
|
Patch198: AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch
|
||||||
|
Patch199: math-Improve-layout-of-exp-exp10-data.patch
|
||||||
|
|
||||||
#openEuler patch list
|
#openEuler patch list
|
||||||
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
|
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
|
||||||
@ -310,6 +329,8 @@ Patch9034: 0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
|
|||||||
Patch9035: 0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
|
Patch9035: 0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
|
||||||
Patch9036: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
|
Patch9036: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
|
||||||
|
|
||||||
|
Patch9037: fix-CVE-2019-1010023.patch
|
||||||
|
|
||||||
Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
|
Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
|
||||||
|
|
||||||
BuildRequires: audit-libs-devel >= 1.1.3, sed >= 3.95, libcap-devel, gettext
|
BuildRequires: audit-libs-devel >= 1.1.3, sed >= 3.95, libcap-devel, gettext
|
||||||
@ -646,6 +667,7 @@ mkdir $builddir
|
|||||||
pushd $builddir
|
pushd $builddir
|
||||||
../configure CC="%GCC" CXX="%GXX" CFLAGS="$BuildFlags" LDFLAGS="$LinkFlags" \
|
../configure CC="%GCC" CXX="%GXX" CFLAGS="$BuildFlags" LDFLAGS="$LinkFlags" \
|
||||||
--prefix=%{_prefix} \
|
--prefix=%{_prefix} \
|
||||||
|
--enable-hardcoded-path-in-tests \
|
||||||
--with-headers=%{_prefix}/include $EnableKernel \
|
--with-headers=%{_prefix}/include $EnableKernel \
|
||||||
--with-nonshared-cflags=-Wp,-D_FORTIFY_SOURCE=2 \
|
--with-nonshared-cflags=-Wp,-D_FORTIFY_SOURCE=2 \
|
||||||
--enable-bind-now \
|
--enable-bind-now \
|
||||||
@ -1085,7 +1107,9 @@ function removeLoadPath()
|
|||||||
currPath=$(echo $runpathInfo | awk -F "RUNPATH=" '{print $2}')
|
currPath=$(echo $runpathInfo | awk -F "RUNPATH=" '{print $2}')
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ x"$currPath" == x"\$ORIGIN" ]; then
|
# Commit 2dcaf7064 uses rpath instead of runpath with --enable-hardcoded-path-in-tests.
|
||||||
|
# Use "\$ORIGIN"* to match an rpath value that starts with $ORIGIN.
|
||||||
|
if [[ x"$currPath" == x"\$ORIGIN"* ]]; then
|
||||||
chrpath -d $file
|
chrpath -d $file
|
||||||
|
|
||||||
findReliantLib $file
|
findReliantLib $file
|
||||||
@ -1486,6 +1510,37 @@ fi
|
|||||||
%endif
|
%endif
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Fri Mar 28 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-57
|
||||||
|
- math: Improve layout of exp/exp10 data
|
||||||
|
- AArch64: Use prefer_sve_ifuncs for SVE memset
|
||||||
|
- AArch64: Add SVE memset
|
||||||
|
- math: Improve layout of expf data
|
||||||
|
- AArch64: Remove zva_128 from memset
|
||||||
|
- AArch64: Optimize memset
|
||||||
|
- AArch64: Improve generic strlen
|
||||||
|
- assert: Add test for CVE-2025-0395
|
||||||
|
|
||||||
|
* Wed Mar 12 2025 xiajimei <xiejiamei@hygon.cn> - 2.38-56
|
||||||
|
- x86: Enable non-temporal memset for Hygon processors
|
||||||
|
- x86: Add `Avoid_STOSB` tunable to allow NT memset without ERMS
|
||||||
|
- x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
|
||||||
|
- x86: Disable non-temporal memset on Skylake Server
|
||||||
|
- x86: Fix value for `x86_memset_non_temporal_threshold` when it is undesirable
|
||||||
|
- x86: Enable non-temporal memset tunable for AMD
|
||||||
|
- x86: Add seperate non-temporal tunable for memset
|
||||||
|
- x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
|
||||||
|
- x86: Add cache information support for Hygon processors
|
||||||
|
- x86: Add new architecture type for Hygon processors
|
||||||
|
|
||||||
|
* Sat Mar 08 2025 shixuantong <shixuantong1@huawei.com> - 2.38-55
|
||||||
|
- elf: Keep using minimal malloc after early DTV resize
|
||||||
|
|
||||||
|
* Tue Feb 18 2025 shixuantong <shixuantong1@huawei.com> - 2.38-54
|
||||||
|
- glibc testcases use the newly built ld.so instead of the environment's default installed ld.so
|
||||||
|
|
||||||
|
* Thu Feb 06 2025 shixuantong <shixuantong1@huawei.com> - 2.38-53
|
||||||
|
- fix CVE-2019-1010023
|
||||||
|
|
||||||
* Sun Jan 26 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-52
|
* Sun Jan 26 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-52
|
||||||
- stdlib: Test using setenv with updated environ [BZ #32588]
|
- stdlib: Test using setenv with updated environ [BZ #32588]
|
||||||
- Fix underallocation of abort_msg_s struct (CVE-2025-0395)
|
- Fix underallocation of abort_msg_s struct (CVE-2025-0395)
|
||||||
|
|||||||
39
math-Improve-layout-of-exp-exp10-data.patch
Normal file
39
math-Improve-layout-of-exp-exp10-data.patch
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
From 5a08d049dc5037e89eb95bb1506652f0043fa39e Mon Sep 17 00:00:00 2001
|
||||||
|
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||||
|
Date: Fri, 13 Dec 2024 15:43:07 +0000
|
||||||
|
Subject: [PATCH] math: Improve layout of exp/exp10 data
|
||||||
|
|
||||||
|
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
|
||||||
|
changes the exp_data struct slightly so that the fields are better aligned
|
||||||
|
and without gaps. As a result on targets that support them, more load-pair
|
||||||
|
instructions are used in exp.
|
||||||
|
|
||||||
|
The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on
|
||||||
|
Neoverse V2.
|
||||||
|
|
||||||
|
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
(cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7)
|
||||||
|
---
|
||||||
|
sysdeps/ieee754/dbl-64/math_config.h | 3 ++-
|
||||||
|
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
|
||||||
|
index 19af33fd86..52b720ecd1 100644
|
||||||
|
--- a/sysdeps/ieee754/dbl-64/math_config.h
|
||||||
|
+++ b/sysdeps/ieee754/dbl-64/math_config.h
|
||||||
|
@@ -195,10 +195,11 @@ check_uflow (double x)
|
||||||
|
extern const struct exp_data
|
||||||
|
{
|
||||||
|
double invln2N;
|
||||||
|
- double shift;
|
||||||
|
double negln2hiN;
|
||||||
|
double negln2loN;
|
||||||
|
double poly[4]; /* Last four coefficients. */
|
||||||
|
+ double shift;
|
||||||
|
+
|
||||||
|
double exp2_shift;
|
||||||
|
double exp2_poly[EXP2_POLY_ORDER];
|
||||||
|
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
34
math-Improve-layout-of-expf-data.patch
Normal file
34
math-Improve-layout-of-expf-data.patch
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
From 3de5112326a4274c97f154f3d335c11965ee960c Mon Sep 17 00:00:00 2001
|
||||||
|
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||||
|
Date: Wed, 24 Jul 2024 15:17:47 +0100
|
||||||
|
Subject: [PATCH] math: Improve layout of expf data
|
||||||
|
|
||||||
|
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
|
||||||
|
changes the exp2f_data struct slightly so that the fields are better aligned.
|
||||||
|
As a result on targets that support them, load-pair instructions accessing
|
||||||
|
poly_scaled and invln2_scaled are now 16-byte aligned.
|
||||||
|
|
||||||
|
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
(cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
|
||||||
|
---
|
||||||
|
sysdeps/ieee754/flt-32/math_config.h | 2 +-
|
||||||
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
|
||||||
|
index d1b06a1a90..5904eb9bac 100644
|
||||||
|
--- a/sysdeps/ieee754/flt-32/math_config.h
|
||||||
|
+++ b/sysdeps/ieee754/flt-32/math_config.h
|
||||||
|
@@ -166,9 +166,9 @@ extern const struct exp2f_data
|
||||||
|
uint64_t tab[1 << EXP2F_TABLE_BITS];
|
||||||
|
double shift_scaled;
|
||||||
|
double poly[EXP2F_POLY_ORDER];
|
||||||
|
- double shift;
|
||||||
|
double invln2_scaled;
|
||||||
|
double poly_scaled[EXP2F_POLY_ORDER];
|
||||||
|
+ double shift;
|
||||||
|
} __exp2f_data attribute_hidden;
|
||||||
|
|
||||||
|
#define LOGF_TABLE_BITS 4
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user