!983 [sync] PR-982: backport from glibc upstream 2.38 branch
From: @openeuler-sync-bot Reviewed-by: @liqingqing_1229 Signed-off-by: @liqingqing_1229
This commit is contained in:
commit
de3e1f1fa3
89
Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
Normal file
89
Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
From c32fd59314c343db88c3ea4a203870481d33c3d2 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||||
|
Date: Tue, 21 Jan 2025 16:11:06 -0500
|
||||||
|
Subject: [PATCH] Fix underallocation of abort_msg_s struct
|
||||||
|
(CVE-2025-0395)
|
||||||
|
|
||||||
|
Include the space needed to store the length of the message itself, in
|
||||||
|
addition to the message string. This resolves BZ #32582.
|
||||||
|
|
||||||
|
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||||
|
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
(cherry picked from commit 68ee0f704cb81e9ad0a78c644a83e1e9cd2ee578)
|
||||||
|
---
|
||||||
|
NEWS | 6 ++++++
|
||||||
|
assert/assert.c | 4 +++-
|
||||||
|
sysdeps/posix/libc_fatal.c | 4 +++-
|
||||||
|
3 files changed, 12 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/NEWS b/NEWS
|
||||||
|
index d0815514e0..3e511d6de4 100644
|
||||||
|
--- a/NEWS
|
||||||
|
+++ b/NEWS
|
||||||
|
@@ -34,6 +34,11 @@ Security related changes:
|
||||||
|
buffer overflow, which could be exploited to achieve escalated
|
||||||
|
privileges. This flaw was introduced in glibc 2.34.
|
||||||
|
|
||||||
|
+ CVE-2025-0395: When the assert() function fails, it does not allocate
|
||||||
|
+ enough space for the assertion failure message string and size
|
||||||
|
+ information, which may lead to a buffer overflow if the message string
|
||||||
|
+ size aligns to page size.
|
||||||
|
+
|
||||||
|
The following bugs are resolved with this release:
|
||||||
|
|
||||||
|
[27821] ungetc: Fix backup buffer leak on program exit
|
||||||
|
@@ -61,6 +66,7 @@ The following bugs are resolved with this release:
|
||||||
|
[32137] libio: Attempt wide backup free only for non-legacy code
|
||||||
|
[32231] elf: Change ldconfig auxcache magic number
|
||||||
|
[32470] x86: Avoid integer truncation with large cache sizes
|
||||||
|
+ [32582] Fix underallocation of abort_msg_s struct (CVE-2025-0395)
|
||||||
|
|
||||||
|
Version 2.38
|
||||||
|
|
||||||
|
diff --git a/assert/assert.c b/assert/assert.c
|
||||||
|
index b7c7a4a1ba..65a9fedf0d 100644
|
||||||
|
--- a/assert/assert.c
|
||||||
|
+++ b/assert/assert.c
|
||||||
|
@@ -18,6 +18,7 @@
|
||||||
|
#include <assert.h>
|
||||||
|
#include <atomic.h>
|
||||||
|
#include <ldsodefs.h>
|
||||||
|
+#include <libc-pointer-arith.h>
|
||||||
|
#include <libintl.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
@@ -64,7 +65,8 @@ __assert_fail_base (const char *fmt, const char *assertion, const char *file,
|
||||||
|
(void) __fxprintf (NULL, "%s", str);
|
||||||
|
(void) fflush (stderr);
|
||||||
|
|
||||||
|
- total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
|
||||||
|
+ total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
|
||||||
|
+ GLRO(dl_pagesize));
|
||||||
|
struct abort_msg_s *buf = __mmap (NULL, total, PROT_READ | PROT_WRITE,
|
||||||
|
MAP_ANON | MAP_PRIVATE, -1, 0);
|
||||||
|
if (__glibc_likely (buf != MAP_FAILED))
|
||||||
|
diff --git a/sysdeps/posix/libc_fatal.c b/sysdeps/posix/libc_fatal.c
|
||||||
|
index 70edcc10c1..5b9e4b7918 100644
|
||||||
|
--- a/sysdeps/posix/libc_fatal.c
|
||||||
|
+++ b/sysdeps/posix/libc_fatal.c
|
||||||
|
@@ -20,6 +20,7 @@
|
||||||
|
#include <errno.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <ldsodefs.h>
|
||||||
|
+#include <libc-pointer-arith.h>
|
||||||
|
#include <paths.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
@@ -123,7 +124,8 @@ __libc_message (const char *fmt, ...)
|
||||||
|
|
||||||
|
WRITEV_FOR_FATAL (fd, iov, nlist, total);
|
||||||
|
|
||||||
|
- total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
|
||||||
|
+ total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
|
||||||
|
+ GLRO(dl_pagesize));
|
||||||
|
struct abort_msg_s *buf = __mmap (NULL, total,
|
||||||
|
PROT_READ | PROT_WRITE,
|
||||||
|
MAP_ANON | MAP_PRIVATE, -1, 0);
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
50
elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
Normal file
50
elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
From 48642ef1a5721e0a7694d84fe46d83b6086dfe75 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Florian Weimer <fweimer@redhat.com>
|
||||||
|
Date: Mon, 3 Jun 2024 10:49:40 +0200
|
||||||
|
Subject: [PATCH] elf: Avoid some free (NULL) calls in
|
||||||
|
_dl_update_slotinfo
|
||||||
|
|
||||||
|
This has been confirmed to work around some interposed mallocs. Here
|
||||||
|
is a discussion of the impact test ust/libc-wrapper/test_libc-wrapper
|
||||||
|
in lttng-tools:
|
||||||
|
|
||||||
|
New TLS usage in libgcc_s.so.1, compatibility impact
|
||||||
|
<https://inbox.sourceware.org/libc-alpha/8734v1ieke.fsf@oldenburg.str.redhat.com/>
|
||||||
|
|
||||||
|
Reportedly, this patch also papers over a similar issue when tcmalloc
|
||||||
|
2.9.1 is not compiled with -ftls-model=initial-exec. Of course the
|
||||||
|
goal really should be to compile mallocs with the initial-exec TLS
|
||||||
|
model, but this commit appears to be a useful interim workaround.
|
||||||
|
|
||||||
|
Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow
|
||||||
|
tls access after dlopen [BZ #19924]").
|
||||||
|
|
||||||
|
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||||
|
(cherry picked from commit afe42e935b3ee97bac9a7064157587777259c60e)
|
||||||
|
---
|
||||||
|
elf/dl-tls.c | 9 ++++++++-
|
||||||
|
1 file changed, 8 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
|
||||||
|
index 70446e71a8..de0168319c 100644
|
||||||
|
--- a/elf/dl-tls.c
|
||||||
|
+++ b/elf/dl-tls.c
|
||||||
|
@@ -819,7 +819,14 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
|
||||||
|
dtv entry free it. Note: this is not AS-safe. */
|
||||||
|
/* XXX Ideally we will at some point create a memory
|
||||||
|
pool. */
|
||||||
|
- free (dtv[modid].pointer.to_free);
|
||||||
|
+ /* Avoid calling free on a null pointer. Some mallocs
|
||||||
|
+ incorrectly use dynamic TLS, and depending on how the
|
||||||
|
+ free function was compiled, it could call
|
||||||
|
+ __tls_get_addr before the null pointer check in the
|
||||||
|
+ free implementation. Checking here papers over at
|
||||||
|
+ least some dynamic TLS usage by interposed mallocs. */
|
||||||
|
+ if (dtv[modid].pointer.to_free != NULL)
|
||||||
|
+ free (dtv[modid].pointer.to_free);
|
||||||
|
dtv[modid].pointer.val = TLS_DTV_UNALLOCATED;
|
||||||
|
dtv[modid].pointer.to_free = NULL;
|
||||||
|
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
328
elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch
Normal file
328
elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch
Normal file
@ -0,0 +1,328 @@
|
|||||||
|
From 7772f9358c9a947251196ea7844b339f0a423ff6 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||||
|
Date: Tue, 16 Feb 2021 12:55:13 +0000
|
||||||
|
Subject: [PATCH] elf: Fix slow tls access after dlopen [BZ #19924]
|
||||||
|
|
||||||
|
In short: __tls_get_addr checks the global generation counter and if
|
||||||
|
the current dtv is older then _dl_update_slotinfo updates dtv up to the
|
||||||
|
generation of the accessed module. So if the global generation is newer
|
||||||
|
than generation of the module then __tls_get_addr keeps hitting the
|
||||||
|
slow dtv update path. The dtv update path includes a number of checks
|
||||||
|
to see if any update is needed and this already causes measurable tls
|
||||||
|
access slow down after dlopen.
|
||||||
|
|
||||||
|
It may be possible to detect up-to-date dtv faster. But if there are
|
||||||
|
many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at
|
||||||
|
least walking the slotinfo list.
|
||||||
|
|
||||||
|
This patch tries to update the dtv to the global generation instead, so
|
||||||
|
after a dlopen the tls access slow path is only hit once. The modules
|
||||||
|
with larger generation than the accessed one were not necessarily
|
||||||
|
synchronized before, so additional synchronization is needed.
|
||||||
|
|
||||||
|
This patch uses acquire/release synchronization when accessing the
|
||||||
|
generation counter.
|
||||||
|
|
||||||
|
Note: in the x86_64 version of dl-tls.c the generation is only loaded
|
||||||
|
once, since relaxed mo is not faster than acquire mo load.
|
||||||
|
|
||||||
|
I have not benchmarked this. Tested by Adhemerval Zanella on aarch64,
|
||||||
|
powerpc, sparc, x86 who reported that it fixes the performance issue
|
||||||
|
of bug 19924.
|
||||||
|
|
||||||
|
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
(cherry picked from commit d2123d68275acc0f061e73d5f86ca504e0d5a344)
|
||||||
|
---
|
||||||
|
elf/dl-close.c | 2 +-
|
||||||
|
elf/dl-open.c | 8 +--
|
||||||
|
elf/dl-reloc.c | 6 +-
|
||||||
|
elf/dl-tls.c | 117 ++++++++++++++++++++-----------------
|
||||||
|
sysdeps/generic/ldsodefs.h | 3 +-
|
||||||
|
sysdeps/x86_64/dl-tls.c | 4 +-
|
||||||
|
6 files changed, 74 insertions(+), 66 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/elf/dl-close.c b/elf/dl-close.c
|
||||||
|
index b887a44888..1c7a861db1 100644
|
||||||
|
--- a/elf/dl-close.c
|
||||||
|
+++ b/elf/dl-close.c
|
||||||
|
@@ -703,7 +703,7 @@ _dl_close_worker (struct link_map *map, bool force)
|
||||||
|
if (__glibc_unlikely (newgen == 0))
|
||||||
|
_dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n");
|
||||||
|
/* Can be read concurrently. */
|
||||||
|
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
|
||||||
|
+ atomic_store_release (&GL(dl_tls_generation), newgen);
|
||||||
|
|
||||||
|
if (tls_free_end == GL(dl_tls_static_used))
|
||||||
|
GL(dl_tls_static_used) = tls_free_start;
|
||||||
|
diff --git a/elf/dl-open.c b/elf/dl-open.c
|
||||||
|
index 2d985e21d8..351931af04 100644
|
||||||
|
--- a/elf/dl-open.c
|
||||||
|
+++ b/elf/dl-open.c
|
||||||
|
@@ -405,7 +405,7 @@ update_tls_slotinfo (struct link_map *new)
|
||||||
|
_dl_fatal_printf (N_("\
|
||||||
|
TLS generation counter wrapped! Please report this."));
|
||||||
|
/* Can be read concurrently. */
|
||||||
|
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
|
||||||
|
+ atomic_store_release (&GL(dl_tls_generation), newgen);
|
||||||
|
|
||||||
|
/* We need a second pass for static tls data, because
|
||||||
|
_dl_update_slotinfo must not be run while calls to
|
||||||
|
@@ -422,8 +422,8 @@ TLS generation counter wrapped! Please report this."));
|
||||||
|
now, but we can delay updating the DTV. */
|
||||||
|
imap->l_need_tls_init = 0;
|
||||||
|
#ifdef SHARED
|
||||||
|
- /* Update the slot information data for at least the
|
||||||
|
- generation of the DSO we are allocating data for. */
|
||||||
|
+ /* Update the slot information data for the current
|
||||||
|
+ generation. */
|
||||||
|
|
||||||
|
/* FIXME: This can terminate the process on memory
|
||||||
|
allocation failure. It is not possible to raise
|
||||||
|
@@ -431,7 +431,7 @@ TLS generation counter wrapped! Please report this."));
|
||||||
|
_dl_update_slotinfo would have to be split into two
|
||||||
|
operations, similar to resize_scopes and update_scopes
|
||||||
|
above. This is related to bug 16134. */
|
||||||
|
- _dl_update_slotinfo (imap->l_tls_modid);
|
||||||
|
+ _dl_update_slotinfo (imap->l_tls_modid, newgen);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
dl_init_static_tls (imap);
|
||||||
|
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
|
||||||
|
index 1d558c1e0c..e5c555d82c 100644
|
||||||
|
--- a/elf/dl-reloc.c
|
||||||
|
+++ b/elf/dl-reloc.c
|
||||||
|
@@ -112,11 +112,11 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional)
|
||||||
|
if (map->l_real->l_relocated)
|
||||||
|
{
|
||||||
|
#ifdef SHARED
|
||||||
|
+ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock)
|
||||||
|
+ is held here so normal load of the generation counter is valid. */
|
||||||
|
if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
|
||||||
|
0))
|
||||||
|
- /* Update the slot information data for at least the generation of
|
||||||
|
- the DSO we are allocating data for. */
|
||||||
|
- (void) _dl_update_slotinfo (map->l_tls_modid);
|
||||||
|
+ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
dl_init_static_tls (map);
|
||||||
|
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
|
||||||
|
index 1f6f820819..70446e71a8 100644
|
||||||
|
--- a/elf/dl-tls.c
|
||||||
|
+++ b/elf/dl-tls.c
|
||||||
|
@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map)
|
||||||
|
|
||||||
|
|
||||||
|
struct link_map *
|
||||||
|
-_dl_update_slotinfo (unsigned long int req_modid)
|
||||||
|
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
|
||||||
|
{
|
||||||
|
struct link_map *the_map = NULL;
|
||||||
|
dtv_t *dtv = THREAD_DTV ();
|
||||||
|
|
||||||
|
- /* The global dl_tls_dtv_slotinfo array contains for each module
|
||||||
|
- index the generation counter current when the entry was created.
|
||||||
|
+ /* CONCURRENCY NOTES:
|
||||||
|
+
|
||||||
|
+ The global dl_tls_dtv_slotinfo_list array contains for each module
|
||||||
|
+ index the generation counter current when that entry was updated.
|
||||||
|
This array never shrinks so that all module indices which were
|
||||||
|
- valid at some time can be used to access it. Before the first
|
||||||
|
- use of a new module index in this function the array was extended
|
||||||
|
- appropriately. Access also does not have to be guarded against
|
||||||
|
- modifications of the array. It is assumed that pointer-size
|
||||||
|
- values can be read atomically even in SMP environments. It is
|
||||||
|
- possible that other threads at the same time dynamically load
|
||||||
|
- code and therefore add to the slotinfo list. This is a problem
|
||||||
|
- since we must not pick up any information about incomplete work.
|
||||||
|
- The solution to this is to ignore all dtv slots which were
|
||||||
|
- created after the one we are currently interested. We know that
|
||||||
|
- dynamic loading for this module is completed and this is the last
|
||||||
|
- load operation we know finished. */
|
||||||
|
- unsigned long int idx = req_modid;
|
||||||
|
+ valid at some time can be used to access it. Concurrent loading
|
||||||
|
+ and unloading of modules can update slotinfo entries or extend
|
||||||
|
+ the array. The updates happen under the GL(dl_load_tls_lock) and
|
||||||
|
+ finish with the release store of the generation counter to
|
||||||
|
+ GL(dl_tls_generation) which is synchronized with the load of
|
||||||
|
+ new_gen in the caller. So updates up to new_gen are synchronized
|
||||||
|
+ but updates for later generations may not be.
|
||||||
|
+
|
||||||
|
+ Here we update the thread dtv from old_gen (== dtv[0].counter) to
|
||||||
|
+ new_gen generation. For this, each dtv[i] entry is either set to
|
||||||
|
+ an unallocated state (set), or left unmodified (nop). Where (set)
|
||||||
|
+ may resize the dtv first if modid i >= dtv[-1].counter. The rules
|
||||||
|
+ for the decision between (set) and (nop) are
|
||||||
|
+
|
||||||
|
+ (1) If slotinfo entry i is concurrently updated then either (set)
|
||||||
|
+ or (nop) is valid: TLS access cannot use dtv[i] unless it is
|
||||||
|
+ synchronized with a generation > new_gen.
|
||||||
|
+
|
||||||
|
+ Otherwise, if the generation of slotinfo entry i is gen and the
|
||||||
|
+ loaded module for this entry is map then
|
||||||
|
+
|
||||||
|
+ (2) If gen <= old_gen then do (nop).
|
||||||
|
+
|
||||||
|
+ (3) If old_gen < gen <= new_gen then
|
||||||
|
+ (3.1) if map != 0 then (set)
|
||||||
|
+ (3.2) if map == 0 then either (set) or (nop).
|
||||||
|
+
|
||||||
|
+ Note that (1) cannot be reliably detected, but since both actions
|
||||||
|
+ are valid it does not have to be. Only (2) and (3.1) cases need
|
||||||
|
+ to be distinguished for which relaxed mo access of gen and map is
|
||||||
|
+ enough: their value is synchronized when it matters.
|
||||||
|
+
|
||||||
|
+ Note that a relaxed mo load may give an out-of-thin-air value since
|
||||||
|
+ it is used in decisions that can affect concurrent stores. But this
|
||||||
|
+ should only happen if the OOTA value causes UB that justifies the
|
||||||
|
+ concurrent store of the value. This is not expected to be an issue
|
||||||
|
+ in practice. */
|
||||||
|
struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
|
||||||
|
|
||||||
|
- while (idx >= listp->len)
|
||||||
|
+ if (dtv[0].counter < new_gen)
|
||||||
|
{
|
||||||
|
- idx -= listp->len;
|
||||||
|
- listp = listp->next;
|
||||||
|
- }
|
||||||
|
-
|
||||||
|
- if (dtv[0].counter < listp->slotinfo[idx].gen)
|
||||||
|
- {
|
||||||
|
- /* CONCURRENCY NOTES:
|
||||||
|
-
|
||||||
|
- Here the dtv needs to be updated to new_gen generation count.
|
||||||
|
-
|
||||||
|
- This code may be called during TLS access when GL(dl_load_tls_lock)
|
||||||
|
- is not held. In that case the user code has to synchronize with
|
||||||
|
- dlopen and dlclose calls of relevant modules. A module m is
|
||||||
|
- relevant if the generation of m <= new_gen and dlclose of m is
|
||||||
|
- synchronized: a memory access here happens after the dlopen and
|
||||||
|
- before the dlclose of relevant modules. The dtv entries for
|
||||||
|
- relevant modules need to be updated, other entries can be
|
||||||
|
- arbitrary.
|
||||||
|
-
|
||||||
|
- This e.g. means that the first part of the slotinfo list can be
|
||||||
|
- accessed race free, but the tail may be concurrently extended.
|
||||||
|
- Similarly relevant slotinfo entries can be read race free, but
|
||||||
|
- other entries are racy. However updating a non-relevant dtv
|
||||||
|
- entry does not affect correctness. For a relevant module m,
|
||||||
|
- max_modid >= modid of m. */
|
||||||
|
- size_t new_gen = listp->slotinfo[idx].gen;
|
||||||
|
size_t total = 0;
|
||||||
|
size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
|
||||||
|
assert (max_modid >= req_modid);
|
||||||
|
@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int req_modid)
|
||||||
|
{
|
||||||
|
size_t modid = total + cnt;
|
||||||
|
|
||||||
|
- /* Later entries are not relevant. */
|
||||||
|
+ /* Case (1) for all later modids. */
|
||||||
|
if (modid > max_modid)
|
||||||
|
break;
|
||||||
|
|
||||||
|
size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);
|
||||||
|
|
||||||
|
+ /* Case (1). */
|
||||||
|
if (gen > new_gen)
|
||||||
|
- /* Not relevant. */
|
||||||
|
continue;
|
||||||
|
|
||||||
|
- /* If the entry is older than the current dtv layout we
|
||||||
|
- know we don't have to handle it. */
|
||||||
|
+ /* Case (2) or (1). */
|
||||||
|
if (gen <= dtv[0].counter)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
+ /* Case (3) or (1). */
|
||||||
|
+
|
||||||
|
/* If there is no map this means the entry is empty. */
|
||||||
|
struct link_map *map
|
||||||
|
= atomic_load_relaxed (&listp->slotinfo[cnt].map);
|
||||||
|
/* Check whether the current dtv array is large enough. */
|
||||||
|
if (dtv[-1].counter < modid)
|
||||||
|
{
|
||||||
|
+ /* Case (3.2) or (1). */
|
||||||
|
if (map == NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
- /* Resize the dtv. */
|
||||||
|
+ /* Resizing the dtv aborts on failure: bug 16134. */
|
||||||
|
dtv = _dl_resize_dtv (dtv, max_modid);
|
||||||
|
|
||||||
|
assert (modid <= dtv[-1].counter);
|
||||||
|
@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int req_modid)
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If there is currently memory allocate for this
|
||||||
|
- dtv entry free it. */
|
||||||
|
+ dtv entry free it. Note: this is not AS-safe. */
|
||||||
|
/* XXX Ideally we will at some point create a memory
|
||||||
|
pool. */
|
||||||
|
free (dtv[modid].pointer.to_free);
|
||||||
|
@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
|
||||||
|
|
||||||
|
static struct link_map *
|
||||||
|
__attribute_noinline__
|
||||||
|
-update_get_addr (GET_ADDR_ARGS)
|
||||||
|
+update_get_addr (GET_ADDR_ARGS, size_t gen)
|
||||||
|
{
|
||||||
|
- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
|
||||||
|
+ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
|
||||||
|
dtv_t *dtv = THREAD_DTV ();
|
||||||
|
|
||||||
|
void *p = dtv[GET_ADDR_MODULE].pointer.val;
|
||||||
|
@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS)
|
||||||
|
dtv_t *dtv = THREAD_DTV ();
|
||||||
|
|
||||||
|
/* Update is needed if dtv[0].counter < the generation of the accessed
|
||||||
|
- module. The global generation counter is used here as it is easier
|
||||||
|
- to check. Synchronization for the relaxed MO access is guaranteed
|
||||||
|
- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */
|
||||||
|
+ module, but the global generation counter is easier to check (which
|
||||||
|
+ must be synchronized up to the generation of the accessed module by
|
||||||
|
+ user code doing the TLS access so relaxed mo read is enough). */
|
||||||
|
size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
|
||||||
|
if (__glibc_unlikely (dtv[0].counter != gen))
|
||||||
|
- return update_get_addr (GET_ADDR_PARAM);
|
||||||
|
+ {
|
||||||
|
+ /* Update DTV up to the global generation, see CONCURRENCY NOTES
|
||||||
|
+ in _dl_update_slotinfo. */
|
||||||
|
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
|
||||||
|
+ return update_get_addr (GET_ADDR_PARAM, gen);
|
||||||
|
+ }
|
||||||
|
|
||||||
|
void *p = dtv[GET_ADDR_MODULE].pointer.val;
|
||||||
|
|
||||||
|
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
|
||||||
|
index e8b7359b04..ed69c6babd 100644
|
||||||
|
--- a/sysdeps/generic/ldsodefs.h
|
||||||
|
+++ b/sysdeps/generic/ldsodefs.h
|
||||||
|
@@ -1251,7 +1251,8 @@ extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)
|
||||||
|
|
||||||
|
/* Update slot information data for at least the generation of the
|
||||||
|
module with the given index. */
|
||||||
|
-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
|
||||||
|
+extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
|
||||||
|
+ size_t gen)
|
||||||
|
attribute_hidden;
|
||||||
|
|
||||||
|
/* Look up the module's TLS block as for __tls_get_addr,
|
||||||
|
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
|
||||||
|
index 7a7fe38625..e9b6ab9970 100644
|
||||||
|
--- a/sysdeps/x86_64/dl-tls.c
|
||||||
|
+++ b/sysdeps/x86_64/dl-tls.c
|
||||||
|
@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
|
||||||
|
{
|
||||||
|
dtv_t *dtv = THREAD_DTV ();
|
||||||
|
|
||||||
|
- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
|
||||||
|
+ size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
|
||||||
|
if (__glibc_unlikely (dtv[0].counter != gen))
|
||||||
|
- return update_get_addr (GET_ADDR_PARAM);
|
||||||
|
+ return update_get_addr (GET_ADDR_PARAM, gen);
|
||||||
|
|
||||||
|
return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
|
||||||
|
}
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
521
elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
Normal file
521
elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
Normal file
@ -0,0 +1,521 @@
|
|||||||
|
From 549e7f7c5a94f5ccbab2ad5f1babca05028a31c7 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Florian Weimer <fweimer@redhat.com>
|
||||||
|
Date: Mon, 1 Jul 2024 17:42:04 +0200
|
||||||
|
Subject: [PATCH] elf: Support recursive use of dynamic TLS in interposed
|
||||||
|
malloc
|
||||||
|
|
||||||
|
It turns out that quite a few applications use bundled mallocs that
|
||||||
|
have been built to use global-dynamic TLS (instead of the recommended
|
||||||
|
initial-exec TLS). The previous workaround from
|
||||||
|
commit afe42e935b3ee97bac9a7064157587777259c60e ("elf: Avoid some
|
||||||
|
free (NULL) calls in _dl_update_slotinfo") does not fix all
|
||||||
|
encountered cases unfortunately.
|
||||||
|
|
||||||
|
This change avoids the TLS generation update for recursive use
|
||||||
|
of TLS from a malloc that was called during a TLS update. This
|
||||||
|
is possible because an interposed malloc has a fixed module ID and
|
||||||
|
TLS slot. (It cannot be unloaded.) If an initially-loaded module ID
|
||||||
|
is encountered in __tls_get_addr and the dynamic linker is already
|
||||||
|
in the middle of a TLS update, use the outdated DTV, thus avoiding
|
||||||
|
another call into malloc. It's still necessary to update the
|
||||||
|
DTV to the most recent generation, to get out of the slow path,
|
||||||
|
which is why the check for recursion is needed.
|
||||||
|
|
||||||
|
The bookkeeping is done using a global counter instead of per-thread
|
||||||
|
flag because TLS access in the dynamic linker is tricky.
|
||||||
|
|
||||||
|
All this will go away once the dynamic linker stops using malloc
|
||||||
|
for TLS, likely as part of a change that pre-allocates all TLS
|
||||||
|
during pthread_create/dlopen.
|
||||||
|
|
||||||
|
Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow
|
||||||
|
tls access after dlopen [BZ #19924]").
|
||||||
|
|
||||||
|
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||||
|
(cherry picked from commit 018f0fc3b818d4d1460a4e2384c24802504b1d20)
|
||||||
|
|
||||||
|
Conflict: adapt file "elf/Makefile" for patch "elf: Switch to main
|
||||||
|
malloc after final ld.so self-relocation"
|
||||||
|
---
|
||||||
|
elf/Makefile | 26 +++++++++
|
||||||
|
elf/dl-tls.c | 95 +++++++++++++++++++++++++++++---
|
||||||
|
elf/rtld.c | 2 +
|
||||||
|
elf/tst-recursive-tls.c | 60 ++++++++++++++++++++
|
||||||
|
elf/tst-recursive-tlsmallocmod.c | 64 +++++++++++++++++++++
|
||||||
|
elf/tst-recursive-tlsmodN.c | 28 ++++++++++
|
||||||
|
sysdeps/generic/ldsodefs.h | 14 +++++
|
||||||
|
sysdeps/x86_64/dl-tls.c | 5 +-
|
||||||
|
8 files changed, 284 insertions(+), 10 deletions(-)
|
||||||
|
create mode 100644 elf/tst-recursive-tls.c
|
||||||
|
create mode 100644 elf/tst-recursive-tlsmallocmod.c
|
||||||
|
create mode 100644 elf/tst-recursive-tlsmodN.c
|
||||||
|
|
||||||
|
diff --git a/elf/Makefile b/elf/Makefile
|
||||||
|
index ea98cba8..391f29e9 100644
|
||||||
|
--- a/elf/Makefile
|
||||||
|
+++ b/elf/Makefile
|
||||||
|
@@ -433,6 +433,7 @@ tests += \
|
||||||
|
tst-p_align1 \
|
||||||
|
tst-p_align2 \
|
||||||
|
tst-p_align3 \
|
||||||
|
+ tst-recursive-tls \
|
||||||
|
tst-relsort1 \
|
||||||
|
tst-ro-dynamic \
|
||||||
|
tst-rtld-no-malloc \
|
||||||
|
@@ -865,6 +866,23 @@ modules-names += \
|
||||||
|
tst-null-argv-lib \
|
||||||
|
tst-p_alignmod-base \
|
||||||
|
tst-p_alignmod3 \
|
||||||
|
+ tst-recursive-tlsmallocmod \
|
||||||
|
+ tst-recursive-tlsmod0 \
|
||||||
|
+ tst-recursive-tlsmod1 \
|
||||||
|
+ tst-recursive-tlsmod2 \
|
||||||
|
+ tst-recursive-tlsmod3 \
|
||||||
|
+ tst-recursive-tlsmod4 \
|
||||||
|
+ tst-recursive-tlsmod5 \
|
||||||
|
+ tst-recursive-tlsmod6 \
|
||||||
|
+ tst-recursive-tlsmod7 \
|
||||||
|
+ tst-recursive-tlsmod8 \
|
||||||
|
+ tst-recursive-tlsmod9 \
|
||||||
|
+ tst-recursive-tlsmod10 \
|
||||||
|
+ tst-recursive-tlsmod11 \
|
||||||
|
+ tst-recursive-tlsmod12 \
|
||||||
|
+ tst-recursive-tlsmod13 \
|
||||||
|
+ tst-recursive-tlsmod14 \
|
||||||
|
+ tst-recursive-tlsmod15 \
|
||||||
|
tst-relsort1mod1 \
|
||||||
|
tst-relsort1mod2 \
|
||||||
|
tst-ro-dynamic-mod \
|
||||||
|
@@ -3042,6 +3060,14 @@ CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
|
||||||
|
CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
|
||||||
|
endif
|
||||||
|
|
||||||
|
+$(objpfx)tst-recursive-tls: $(objpfx)tst-recursive-tlsmallocmod.so
|
||||||
|
+# More objects than DTV_SURPLUS, to trigger DTV reallocation.
|
||||||
|
+$(objpfx)tst-recursive-tls.out: \
|
||||||
|
+ $(patsubst %,$(objpfx)tst-recursive-tlsmod%.so, \
|
||||||
|
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
|
||||||
|
+$(objpfx)tst-recursive-tlsmod%.os: tst-recursive-tlsmodN.c
|
||||||
|
+ $(compile-command.c) -DVAR=thread_$* -DFUNC=get_threadvar_$*
|
||||||
|
+
|
||||||
|
# Reuse an audit module which provides ample debug logging.
|
||||||
|
tst-rtld-no-malloc-audit-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
|
||||||
|
|
||||||
|
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
|
||||||
|
index de016831..59d4021e 100644
|
||||||
|
--- a/elf/dl-tls.c
|
||||||
|
+++ b/elf/dl-tls.c
|
||||||
|
@@ -75,6 +75,31 @@
|
||||||
|
/* Default for dl_tls_static_optional. */
|
||||||
|
#define OPTIONAL_TLS 512
|
||||||
|
|
||||||
|
+/* Used to count the number of threads currently executing dynamic TLS
|
||||||
|
+ updates. Used to avoid recursive malloc calls in __tls_get_addr
|
||||||
|
+ for an interposed malloc that uses global-dynamic TLS (which is not
|
||||||
|
+ recommended); see _dl_tls_allocate_active checks. This could be a
|
||||||
|
+ per-thread flag, but would need TLS access in the dynamic linker. */
|
||||||
|
+unsigned int _dl_tls_threads_in_update;
|
||||||
|
+
|
||||||
|
+static inline void
|
||||||
|
+_dl_tls_allocate_begin (void)
|
||||||
|
+{
|
||||||
|
+ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline void
|
||||||
|
+_dl_tls_allocate_end (void)
|
||||||
|
+{
|
||||||
|
+ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline bool
|
||||||
|
+_dl_tls_allocate_active (void)
|
||||||
|
+{
|
||||||
|
+ return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
/* Compute the static TLS surplus based on the namespace count and the
|
||||||
|
TLS space that can be used for optimizations. */
|
||||||
|
static inline int
|
||||||
|
@@ -425,12 +450,18 @@ _dl_allocate_tls_storage (void)
|
||||||
|
size += TLS_PRE_TCB_SIZE;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
- /* Perform the allocation. Reserve space for the required alignment
|
||||||
|
- and the pointer to the original allocation. */
|
||||||
|
+ /* Reserve space for the required alignment and the pointer to the
|
||||||
|
+ original allocation. */
|
||||||
|
size_t alignment = GLRO (dl_tls_static_align);
|
||||||
|
+
|
||||||
|
+ /* Perform the allocation. */
|
||||||
|
+ _dl_tls_allocate_begin ();
|
||||||
|
void *allocated = malloc (size + alignment + sizeof (void *));
|
||||||
|
if (__glibc_unlikely (allocated == NULL))
|
||||||
|
- return NULL;
|
||||||
|
+ {
|
||||||
|
+ _dl_tls_allocate_end ();
|
||||||
|
+ return NULL;
|
||||||
|
+ }
|
||||||
|
|
||||||
|
/* Perform alignment and allocate the DTV. */
|
||||||
|
#if TLS_TCB_AT_TP
|
||||||
|
@@ -466,6 +497,8 @@ _dl_allocate_tls_storage (void)
|
||||||
|
result = allocate_dtv (result);
|
||||||
|
if (result == NULL)
|
||||||
|
free (allocated);
|
||||||
|
+
|
||||||
|
+ _dl_tls_allocate_end ();
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -483,6 +516,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
|
||||||
|
size_t newsize = max_modid + DTV_SURPLUS;
|
||||||
|
size_t oldsize = dtv[-1].counter;
|
||||||
|
|
||||||
|
+ _dl_tls_allocate_begin ();
|
||||||
|
if (dtv == GL(dl_initial_dtv))
|
||||||
|
{
|
||||||
|
/* This is the initial dtv that was either statically allocated in
|
||||||
|
@@ -502,6 +536,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
|
||||||
|
if (newp == NULL)
|
||||||
|
oom ();
|
||||||
|
}
|
||||||
|
+ _dl_tls_allocate_end ();
|
||||||
|
|
||||||
|
newp[0].counter = newsize;
|
||||||
|
|
||||||
|
@@ -676,7 +711,9 @@ allocate_dtv_entry (size_t alignment, size_t size)
|
||||||
|
if (powerof2 (alignment) && alignment <= _Alignof (max_align_t))
|
||||||
|
{
|
||||||
|
/* The alignment is supported by malloc. */
|
||||||
|
+ _dl_tls_allocate_begin ();
|
||||||
|
void *ptr = malloc (size);
|
||||||
|
+ _dl_tls_allocate_end ();
|
||||||
|
return (struct dtv_pointer) { ptr, ptr };
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -688,7 +725,10 @@ allocate_dtv_entry (size_t alignment, size_t size)
|
||||||
|
|
||||||
|
/* Perform the allocation. This is the pointer we need to free
|
||||||
|
later. */
|
||||||
|
+ _dl_tls_allocate_begin ();
|
||||||
|
void *start = malloc (alloc_size);
|
||||||
|
+ _dl_tls_allocate_end ();
|
||||||
|
+
|
||||||
|
if (start == NULL)
|
||||||
|
return (struct dtv_pointer) {};
|
||||||
|
|
||||||
|
@@ -826,7 +866,11 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
|
||||||
|
free implementation. Checking here papers over at
|
||||||
|
least some dynamic TLS usage by interposed mallocs. */
|
||||||
|
if (dtv[modid].pointer.to_free != NULL)
|
||||||
|
- free (dtv[modid].pointer.to_free);
|
||||||
|
+ {
|
||||||
|
+ _dl_tls_allocate_begin ();
|
||||||
|
+ free (dtv[modid].pointer.to_free);
|
||||||
|
+ _dl_tls_allocate_end ();
|
||||||
|
+ }
|
||||||
|
dtv[modid].pointer.val = TLS_DTV_UNALLOCATED;
|
||||||
|
dtv[modid].pointer.to_free = NULL;
|
||||||
|
|
||||||
|
@@ -956,10 +1000,22 @@ __tls_get_addr (GET_ADDR_ARGS)
|
||||||
|
size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
|
||||||
|
if (__glibc_unlikely (dtv[0].counter != gen))
|
||||||
|
{
|
||||||
|
- /* Update DTV up to the global generation, see CONCURRENCY NOTES
|
||||||
|
- in _dl_update_slotinfo. */
|
||||||
|
- gen = atomic_load_acquire (&GL(dl_tls_generation));
|
||||||
|
- return update_get_addr (GET_ADDR_PARAM, gen);
|
||||||
|
+ if (_dl_tls_allocate_active ()
|
||||||
|
+ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)
|
||||||
|
+ /* This is a reentrant __tls_get_addr call, but we can
|
||||||
|
+ satisfy it because it's an initially-loaded module ID.
|
||||||
|
+ These TLS slotinfo slots do not change, so the
|
||||||
|
+ out-of-date generation counter does not matter. However,
|
||||||
|
+ if not in a TLS update, still update_get_addr below, to
|
||||||
|
+ get off the slow path eventually. */
|
||||||
|
+ ;
|
||||||
|
+ else
|
||||||
|
+ {
|
||||||
|
+ /* Update DTV up to the global generation, see CONCURRENCY NOTES
|
||||||
|
+ in _dl_update_slotinfo. */
|
||||||
|
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
|
||||||
|
+ return update_get_addr (GET_ADDR_PARAM, gen);
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
|
||||||
|
void *p = dtv[GET_ADDR_MODULE].pointer.val;
|
||||||
|
@@ -969,7 +1025,7 @@ __tls_get_addr (GET_ADDR_ARGS)
|
||||||
|
|
||||||
|
return (char *) p + GET_ADDR_OFFSET;
|
||||||
|
}
|
||||||
|
-#endif
|
||||||
|
+#endif /* SHARED */
|
||||||
|
|
||||||
|
|
||||||
|
/* Look up the module's TLS block as for __tls_get_addr,
|
||||||
|
@@ -1018,6 +1074,25 @@ _dl_tls_get_addr_soft (struct link_map *l)
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
+size_t _dl_tls_initial_modid_limit;
|
||||||
|
+
|
||||||
|
+void
|
||||||
|
+_dl_tls_initial_modid_limit_setup (void)
|
||||||
|
+{
|
||||||
|
+ struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
|
||||||
|
+ size_t idx;
|
||||||
|
+ for (idx = 0; idx < listp->len; ++idx)
|
||||||
|
+ {
|
||||||
|
+ struct link_map *l = listp->slotinfo[idx].map;
|
||||||
|
+ if (l == NULL
|
||||||
|
+ /* The object can be unloaded, so its modid can be
|
||||||
|
+ reassociated. */
|
||||||
|
+ || !(l->l_type == lt_executable || l->l_type == lt_library))
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ _dl_tls_initial_modid_limit = idx;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
|
||||||
|
void
|
||||||
|
_dl_add_to_slotinfo (struct link_map *l, bool do_add)
|
||||||
|
@@ -1050,9 +1125,11 @@ _dl_add_to_slotinfo (struct link_map *l, bool do_add)
|
||||||
|
the first slot. */
|
||||||
|
assert (idx == 0);
|
||||||
|
|
||||||
|
+ _dl_tls_allocate_begin ();
|
||||||
|
listp = (struct dtv_slotinfo_list *)
|
||||||
|
malloc (sizeof (struct dtv_slotinfo_list)
|
||||||
|
+ TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
|
||||||
|
+ _dl_tls_allocate_end ();
|
||||||
|
if (listp == NULL)
|
||||||
|
{
|
||||||
|
/* We ran out of memory while resizing the dtv slotinfo list. */
|
||||||
|
diff --git a/elf/rtld.c b/elf/rtld.c
|
||||||
|
index 558733b8..0a1e202c 100644
|
||||||
|
--- a/elf/rtld.c
|
||||||
|
+++ b/elf/rtld.c
|
||||||
|
@@ -789,6 +789,8 @@ init_tls (size_t naudit)
|
||||||
|
_dl_fatal_printf ("\
|
||||||
|
cannot allocate TLS data structures for initial thread\n");
|
||||||
|
|
||||||
|
+ _dl_tls_initial_modid_limit_setup ();
|
||||||
|
+
|
||||||
|
/* Store for detection of the special case by __tls_get_addr
|
||||||
|
so it knows not to pass this dtv to the normal realloc. */
|
||||||
|
GL(dl_initial_dtv) = GET_DTV (tcbp);
|
||||||
|
diff --git a/elf/tst-recursive-tls.c b/elf/tst-recursive-tls.c
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..716d1f78
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/elf/tst-recursive-tls.c
|
||||||
|
@@ -0,0 +1,60 @@
|
||||||
|
+/* Test with interposed malloc with dynamic TLS.
|
||||||
|
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <array_length.h>
|
||||||
|
+#include <stdio.h>
|
||||||
|
+#include <support/check.h>
|
||||||
|
+#include <support/xdlfcn.h>
|
||||||
|
+
|
||||||
|
+/* Defined in tst-recursive-tlsmallocmod.so. */
|
||||||
|
+extern __thread unsigned int malloc_subsytem_counter;
|
||||||
|
+
|
||||||
|
+static int
|
||||||
|
+do_test (void)
|
||||||
|
+{
|
||||||
|
+ /* 16 is large enough to exercise the DTV resizing case. */
|
||||||
|
+ void *handles[16];
|
||||||
|
+
|
||||||
|
+ for (unsigned int i = 0; i < array_length (handles); ++i)
|
||||||
|
+ {
|
||||||
|
+ /* Re-use the TLS slot for module 0. */
|
||||||
|
+ if (i > 0)
|
||||||
|
+ xdlclose (handles[0]);
|
||||||
|
+
|
||||||
|
+ char soname[30];
|
||||||
|
+ snprintf (soname, sizeof (soname), "tst-recursive-tlsmod%u.so", i);
|
||||||
|
+ handles[i] = xdlopen (soname, RTLD_NOW);
|
||||||
|
+
|
||||||
|
+ if (i > 0)
|
||||||
|
+ {
|
||||||
|
+ handles[0] = xdlopen ("tst-recursive-tlsmod0.so", RTLD_NOW);
|
||||||
|
+ int (*fptr) (void) = xdlsym (handles[0], "get_threadvar_0");
|
||||||
|
+ /* May trigger TLS storage allocation using malloc. */
|
||||||
|
+ TEST_COMPARE (fptr (), 0);
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ for (unsigned int i = 0; i < array_length (handles); ++i)
|
||||||
|
+ xdlclose (handles[i]);
|
||||||
|
+
|
||||||
|
+ printf ("info: malloc subsystem calls: %u\n", malloc_subsytem_counter);
|
||||||
|
+ TEST_VERIFY (malloc_subsytem_counter > 0);
|
||||||
|
+ return 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#include <support/test-driver.c>
|
||||||
|
diff --git a/elf/tst-recursive-tlsmallocmod.c b/elf/tst-recursive-tlsmallocmod.c
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..c24e9945
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/elf/tst-recursive-tlsmallocmod.c
|
||||||
|
@@ -0,0 +1,64 @@
|
||||||
|
+/* Interposed malloc with dynamic TLS.
|
||||||
|
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <stdlib.h>
|
||||||
|
+#include <dlfcn.h>
|
||||||
|
+
|
||||||
|
+__thread unsigned int malloc_subsytem_counter;
|
||||||
|
+
|
||||||
|
+static __typeof (malloc) *malloc_fptr;
|
||||||
|
+static __typeof (free) *free_fptr;
|
||||||
|
+static __typeof (calloc) *calloc_fptr;
|
||||||
|
+static __typeof (realloc) *realloc_fptr;
|
||||||
|
+
|
||||||
|
+static void __attribute__ ((constructor))
|
||||||
|
+init (void)
|
||||||
|
+{
|
||||||
|
+ malloc_fptr = dlsym (RTLD_NEXT, "malloc");
|
||||||
|
+ free_fptr = dlsym (RTLD_NEXT, "free");
|
||||||
|
+ calloc_fptr = dlsym (RTLD_NEXT, "calloc");
|
||||||
|
+ realloc_fptr = dlsym (RTLD_NEXT, "realloc");
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void *
|
||||||
|
+malloc (size_t size)
|
||||||
|
+{
|
||||||
|
+ ++malloc_subsytem_counter;
|
||||||
|
+ return malloc_fptr (size);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void
|
||||||
|
+free (void *ptr)
|
||||||
|
+{
|
||||||
|
+ ++malloc_subsytem_counter;
|
||||||
|
+ return free_fptr (ptr);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void *
|
||||||
|
+calloc (size_t a, size_t b)
|
||||||
|
+{
|
||||||
|
+ ++malloc_subsytem_counter;
|
||||||
|
+ return calloc_fptr (a, b);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void *
|
||||||
|
+realloc (void *ptr, size_t size)
|
||||||
|
+{
|
||||||
|
+ ++malloc_subsytem_counter;
|
||||||
|
+ return realloc_fptr (ptr, size);
|
||||||
|
+}
|
||||||
|
diff --git a/elf/tst-recursive-tlsmodN.c b/elf/tst-recursive-tlsmodN.c
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..bb7592ae
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/elf/tst-recursive-tlsmodN.c
|
||||||
|
@@ -0,0 +1,28 @@
|
||||||
|
+/* Test module with global-dynamic TLS. Used to trigger DTV reallocation.
|
||||||
|
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+/* Compiled with VAR and FUNC set via -D. FUNC requires some
|
||||||
|
+ relocation against TLS variable VAR. */
|
||||||
|
+
|
||||||
|
+__thread int VAR;
|
||||||
|
+
|
||||||
|
+int
|
||||||
|
+FUNC (void)
|
||||||
|
+{
|
||||||
|
+ return VAR;
|
||||||
|
+}
|
||||||
|
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
|
||||||
|
index 22fbbecd..ad271ae0 100644
|
||||||
|
--- a/sysdeps/generic/ldsodefs.h
|
||||||
|
+++ b/sysdeps/generic/ldsodefs.h
|
||||||
|
@@ -1262,6 +1262,20 @@ extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
|
||||||
|
size_t gen)
|
||||||
|
attribute_hidden;
|
||||||
|
|
||||||
|
+/* The last TLS module ID that is initially loaded, plus 1. TLS
|
||||||
|
+ addresses for modules with IDs lower than that can be obtained from
|
||||||
|
+ the DTV even if its generation is outdated. */
|
||||||
|
+extern size_t _dl_tls_initial_modid_limit attribute_hidden attribute_relro;
|
||||||
|
+
|
||||||
|
+/* Compute _dl_tls_initial_modid_limit. To be called after initial
|
||||||
|
+ relocation. */
|
||||||
|
+void _dl_tls_initial_modid_limit_setup (void) attribute_hidden;
|
||||||
|
+
|
||||||
|
+/* Number of threads currently in a TLS update. This is used to
|
||||||
|
+ detect reentrant __tls_get_addr calls without a per-thread
|
||||||
|
+ flag. */
|
||||||
|
+extern unsigned int _dl_tls_threads_in_update attribute_hidden;
|
||||||
|
+
|
||||||
|
/* Look up the module's TLS block as for __tls_get_addr,
|
||||||
|
but never touch anything. Return null if it's not allocated yet. */
|
||||||
|
extern void *_dl_tls_get_addr_soft (struct link_map *l) attribute_hidden;
|
||||||
|
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
|
||||||
|
index e9b6ab99..c484f39e 100644
|
||||||
|
--- a/sysdeps/x86_64/dl-tls.c
|
||||||
|
+++ b/sysdeps/x86_64/dl-tls.c
|
||||||
|
@@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
|
||||||
|
dtv_t *dtv = THREAD_DTV ();
|
||||||
|
|
||||||
|
size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
|
||||||
|
- if (__glibc_unlikely (dtv[0].counter != gen))
|
||||||
|
+ if (__glibc_unlikely (dtv[0].counter != gen)
|
||||||
|
+ /* See comment in __tls_get_addr in elf/dl-tls.c. */
|
||||||
|
+ && !(_dl_tls_allocate_active ()
|
||||||
|
+ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit))
|
||||||
|
return update_get_addr (GET_ADDR_PARAM, gen);
|
||||||
|
|
||||||
|
return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
34
glibc.spec
34
glibc.spec
@ -67,7 +67,7 @@
|
|||||||
##############################################################################
|
##############################################################################
|
||||||
Name: glibc
|
Name: glibc
|
||||||
Version: 2.38
|
Version: 2.38
|
||||||
Release: 51
|
Release: 52
|
||||||
Summary: The GNU libc libraries
|
Summary: The GNU libc libraries
|
||||||
License: %{all_license}
|
License: %{all_license}
|
||||||
URL: http://www.gnu.org/software/glibc/
|
URL: http://www.gnu.org/software/glibc/
|
||||||
@ -252,6 +252,21 @@ Patch162: nptl-initialize-rseq-area-prior-to-registration.patch
|
|||||||
Patch163: nptl-initialize-cpu_id_start-prior-to-rseq-registrat.patch
|
Patch163: nptl-initialize-cpu_id_start-prior-to-rseq-registrat.patch
|
||||||
Patch164: x86-Avoid-integer-truncation-with-large-cache-sizes-.patch
|
Patch164: x86-Avoid-integer-truncation-with-large-cache-sizes-.patch
|
||||||
Patch165: LoongArch-Force-SHMLBA-the-same-as-kernel.patch
|
Patch165: LoongArch-Force-SHMLBA-the-same-as-kernel.patch
|
||||||
|
Patch166: x86_64-Sort-fpu-multiarch-Makefile.patch
|
||||||
|
Patch167: x86_64-Add-log2-with-FMA.patch
|
||||||
|
Patch168: x86_64-Add-expm1-with-FMA.patch
|
||||||
|
Patch169: x86_64-Add-log1p-with-FMA.patch
|
||||||
|
Patch170: x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch
|
||||||
|
Patch171: elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch
|
||||||
|
Patch172: x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
|
||||||
|
Patch173: sysdeps-x86-Makefile-Split-and-sort-tests.patch
|
||||||
|
Patch174: x86_64-Fix-missing-wcsncat-function-definition-witho.patch
|
||||||
|
Patch175: x86-Improve-large-memset-perf-with-non-temporal-stor.patch
|
||||||
|
Patch176: x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
|
||||||
|
Patch177: elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
|
||||||
|
Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
|
||||||
|
Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
|
||||||
|
Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
|
||||||
|
|
||||||
#openEuler patch list
|
#openEuler patch list
|
||||||
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
|
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
|
||||||
@ -1471,6 +1486,23 @@ fi
|
|||||||
%endif
|
%endif
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Sun Jan 26 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-52
|
||||||
|
- stdlib: Test using setenv with updated environ [BZ #32588]
|
||||||
|
- Fix underallocation of abort_msg_s struct (CVE-2025-0395)
|
||||||
|
- elf: Support recursive use of dynamic TLS in interposed malloc
|
||||||
|
- elf: Avoid some free (NULL) calls in _dl_update_slotinfo
|
||||||
|
- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212]
|
||||||
|
- x86: Improve large memset perf with non-temporal stores [RHEL-29312]
|
||||||
|
- x86_64: Fix missing wcsncat function definition without multiarch (x86-64-v4)
|
||||||
|
- sysdeps/x86/Makefile: Split and sort tests
|
||||||
|
- x86: Only align destination to 1x VEC_SIZE in memset 4x loop
|
||||||
|
- elf: Fix slow tls access after dlopen [BZ #19924]
|
||||||
|
- x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643]
|
||||||
|
- x86_64: Add log1p with FMA
|
||||||
|
- x86_64: Add expm1 with FMA
|
||||||
|
- x86_64: Add log2 with FMA
|
||||||
|
- x86_64: Sort fpu/multiarch/Makefile
|
||||||
|
|
||||||
* Wed Jan 15 2025 MayShao <mayshao-oc@zhaoxin.com> - 2.38-51
|
* Wed Jan 15 2025 MayShao <mayshao-oc@zhaoxin.com> - 2.38-51
|
||||||
- x86: Set preferred CPU features and default NT threshold for Zhaoxin processors
|
- x86: Set preferred CPU features and default NT threshold for Zhaoxin processors
|
||||||
|
|
||||||
|
|||||||
75
stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
Normal file
75
stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
From 650a0aaaffa9ddb44732fa6156b31c5f30ee596f Mon Sep 17 00:00:00 2001
|
||||||
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||||
|
Date: Fri, 24 Jan 2025 18:53:13 +0800
|
||||||
|
Subject: [PATCH] stdlib: Test using setenv with updated environ [BZ
|
||||||
|
#32588]
|
||||||
|
|
||||||
|
Add a test for setenv with updated environ. Verify that BZ #32588 is
|
||||||
|
fixed.
|
||||||
|
|
||||||
|
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
Reviewed-by: Florian Weimer <fweimer@redhat.com>
|
||||||
|
(cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da)
|
||||||
|
---
|
||||||
|
stdlib/Makefile | 1 +
|
||||||
|
stdlib/tst-setenv-environ.c | 36 ++++++++++++++++++++++++++++++++++++
|
||||||
|
2 files changed, 37 insertions(+)
|
||||||
|
create mode 100644 stdlib/tst-setenv-environ.c
|
||||||
|
|
||||||
|
diff --git a/stdlib/Makefile b/stdlib/Makefile
|
||||||
|
index 25e42a77e7..750810ee92 100644
|
||||||
|
--- a/stdlib/Makefile
|
||||||
|
+++ b/stdlib/Makefile
|
||||||
|
@@ -232,6 +232,7 @@ tests := \
|
||||||
|
tst-setcontext7 \
|
||||||
|
tst-setcontext8 \
|
||||||
|
tst-setcontext9 \
|
||||||
|
+ tst-setenv-environ \
|
||||||
|
tst-strfmon_l \
|
||||||
|
tst-strfrom \
|
||||||
|
tst-strfrom-locale \
|
||||||
|
diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..02fcef96d0
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/stdlib/tst-setenv-environ.c
|
||||||
|
@@ -0,0 +1,36 @@
|
||||||
|
+/* Test using setenv with updated environ.
|
||||||
|
+ Copyright (C) 2025 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <stdlib.h>
|
||||||
|
+#include <support/check.h>
|
||||||
|
+
|
||||||
|
+extern char **environ;
|
||||||
|
+
|
||||||
|
+int
|
||||||
|
+do_test (void)
|
||||||
|
+{
|
||||||
|
+ char *valp;
|
||||||
|
+ static char *dummy_environ[] = { NULL };
|
||||||
|
+ environ = dummy_environ;
|
||||||
|
+ setenv ("A", "1", 0);
|
||||||
|
+ valp = getenv ("A");
|
||||||
|
+ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0');
|
||||||
|
+ return 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#include <support/test-driver.c>
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
178
sysdeps-x86-Makefile-Split-and-sort-tests.patch
Normal file
178
sysdeps-x86-Makefile-Split-and-sort-tests.patch
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
From 0d14bf0754ee8d8cf2bf3dad298fa5c5f97537db Mon Sep 17 00:00:00 2001
|
||||||
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||||
|
Date: Thu, 7 Dec 2023 09:00:11 -0800
|
||||||
|
Subject: [PATCH] sysdeps/x86/Makefile: Split and sort tests
|
||||||
|
|
||||||
|
Put each test on a separate line and sort tests.
|
||||||
|
|
||||||
|
(cherry picked from commit 7e03e0de7e7c2de975b5c5e18f5a4b0c75816674)
|
||||||
|
---
|
||||||
|
sysdeps/x86/Makefile | 110 ++++++++++++++++++++++++++++++-------------
|
||||||
|
1 file changed, 78 insertions(+), 32 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
|
||||||
|
index 917c26f116..5631a59a26 100644
|
||||||
|
--- a/sysdeps/x86/Makefile
|
||||||
|
+++ b/sysdeps/x86/Makefile
|
||||||
|
@@ -10,36 +10,51 @@ sysdep_headers += sys/platform/x86.h bits/platform/x86.h
|
||||||
|
CFLAGS-dl-get-cpu-features.os += $(rtld-early-cflags)
|
||||||
|
CFLAGS-get-cpuid-feature-leaf.o += $(no-stack-protector)
|
||||||
|
|
||||||
|
-tests += tst-get-cpu-features tst-get-cpu-features-static \
|
||||||
|
- tst-cpu-features-cpuinfo tst-cpu-features-cpuinfo-static \
|
||||||
|
- tst-cpu-features-supports tst-cpu-features-supports-static
|
||||||
|
-tests-static += tst-get-cpu-features-static \
|
||||||
|
- tst-cpu-features-cpuinfo-static \
|
||||||
|
- tst-cpu-features-supports-static
|
||||||
|
+tests += \
|
||||||
|
+ tst-get-cpu-features \
|
||||||
|
+ tst-get-cpu-features-static \
|
||||||
|
+ tst-cpu-features-cpuinfo \
|
||||||
|
+ tst-cpu-features-cpuinfo-static \
|
||||||
|
+ tst-cpu-features-supports \
|
||||||
|
+ tst-cpu-features-supports-static \
|
||||||
|
+# tests
|
||||||
|
+tests-static += \
|
||||||
|
+ tst-get-cpu-features-static \
|
||||||
|
+ tst-cpu-features-cpuinfo-static \
|
||||||
|
+ tst-cpu-features-supports-static \
|
||||||
|
+# tests-static
|
||||||
|
ifeq (yes,$(have-ifunc))
|
||||||
|
ifeq (yes,$(have-gcc-ifunc))
|
||||||
|
tests += \
|
||||||
|
tst-ifunc-isa-1 \
|
||||||
|
- tst-ifunc-isa-1-static
|
||||||
|
+ tst-ifunc-isa-1-static \
|
||||||
|
+# tests
|
||||||
|
tests-static += \
|
||||||
|
- tst-ifunc-isa-1-static
|
||||||
|
+ tst-ifunc-isa-1-static \
|
||||||
|
+# tests-static
|
||||||
|
test-xfail-tst-ifunc-isa-1 = $(with-lld)
|
||||||
|
test-xfail-tst-ifunc-isa-1-static = $(with-lld)
|
||||||
|
tests += \
|
||||||
|
tst-ifunc-isa-2 \
|
||||||
|
- tst-ifunc-isa-2-static
|
||||||
|
+ tst-ifunc-isa-2-static \
|
||||||
|
+# tests
|
||||||
|
tests-static += \
|
||||||
|
- tst-ifunc-isa-2-static
|
||||||
|
+ tst-ifunc-isa-2-static \
|
||||||
|
+# tests-static
|
||||||
|
test-xfail-tst-ifunc-isa-2 = $(with-lld)
|
||||||
|
test-xfail-tst-ifunc-isa-2-static = $(with-lld)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq (yes,$(enable-x86-isa-level))
|
||||||
|
-tests += tst-isa-level-1
|
||||||
|
-modules-names += tst-isa-level-mod-1-baseline \
|
||||||
|
- tst-isa-level-mod-1-v2 \
|
||||||
|
- tst-isa-level-mod-1-v3 \
|
||||||
|
- tst-isa-level-mod-1-v4 \
|
||||||
|
+tests += \
|
||||||
|
+ tst-isa-level-1 \
|
||||||
|
+# tests
|
||||||
|
+modules-names += \
|
||||||
|
+ tst-isa-level-mod-1-baseline \
|
||||||
|
+ tst-isa-level-mod-1-v2 \
|
||||||
|
+ tst-isa-level-mod-1-v3 \
|
||||||
|
+ tst-isa-level-mod-1-v4 \
|
||||||
|
+# modules-names
|
||||||
|
|
||||||
|
# X86 ISA level baseline
|
||||||
|
CFLAGS-tst-isa-level-mod-1-baseline.c += -DINCLUDE_X86_ISA_LEVEL \
|
||||||
|
@@ -68,14 +83,18 @@ tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(subdir),math)
|
||||||
|
-tests += tst-ldbl-nonnormal-printf
|
||||||
|
+tests += \
|
||||||
|
+ tst-ldbl-nonnormal-printf \
|
||||||
|
+# tests
|
||||||
|
endif # $(subdir) == math
|
||||||
|
|
||||||
|
ifeq ($(subdir),setjmp)
|
||||||
|
gen-as-const-headers += jmp_buf-ssp.sym
|
||||||
|
sysdep_routines += __longjmp_cancel
|
||||||
|
ifneq ($(enable-cet),no)
|
||||||
|
-tests += tst-setjmp-cet
|
||||||
|
+tests += \
|
||||||
|
+ tst-setjmp-cet \
|
||||||
|
+# tests
|
||||||
|
tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
@@ -122,20 +141,45 @@ ifneq ($(enable-cet),no)
|
||||||
|
ifeq ($(subdir),elf)
|
||||||
|
sysdep-dl-routines += dl-cet
|
||||||
|
|
||||||
|
-tests += tst-cet-legacy-1 tst-cet-legacy-1a tst-cet-legacy-2 \
|
||||||
|
- tst-cet-legacy-2a tst-cet-legacy-3 tst-cet-legacy-4 \
|
||||||
|
- tst-cet-legacy-5a tst-cet-legacy-6a tst-cet-legacy-7 \
|
||||||
|
- tst-cet-legacy-8 tst-cet-legacy-9 tst-cet-legacy-9-static \
|
||||||
|
- tst-cet-legacy-10 tst-cet-legacy-10-static
|
||||||
|
-tests-static += tst-cet-legacy-9-static tst-cet-legacy-10-static
|
||||||
|
+tests += \
|
||||||
|
+ tst-cet-legacy-1 \
|
||||||
|
+ tst-cet-legacy-1a \
|
||||||
|
+ tst-cet-legacy-2 \
|
||||||
|
+ tst-cet-legacy-2a \
|
||||||
|
+ tst-cet-legacy-3 \
|
||||||
|
+ tst-cet-legacy-4 \
|
||||||
|
+ tst-cet-legacy-5a \
|
||||||
|
+ tst-cet-legacy-6a \
|
||||||
|
+ tst-cet-legacy-7 \
|
||||||
|
+ tst-cet-legacy-8 \
|
||||||
|
+ tst-cet-legacy-9 \
|
||||||
|
+ tst-cet-legacy-9-static \
|
||||||
|
+ tst-cet-legacy-10 \
|
||||||
|
+ tst-cet-legacy-10-static \
|
||||||
|
+# tests
|
||||||
|
+tests-static += \
|
||||||
|
+ tst-cet-legacy-9-static \
|
||||||
|
+ tst-cet-legacy-10-static \
|
||||||
|
+# tests-static
|
||||||
|
tst-cet-legacy-1a-ARGS = -- $(host-test-program-cmd)
|
||||||
|
-tests += tst-cet-legacy-4a tst-cet-legacy-4b tst-cet-legacy-4c \
|
||||||
|
- tst-cet-legacy-5b tst-cet-legacy-6b
|
||||||
|
-modules-names += tst-cet-legacy-mod-1 tst-cet-legacy-mod-2 \
|
||||||
|
- tst-cet-legacy-mod-4 tst-cet-legacy-mod-5a \
|
||||||
|
- tst-cet-legacy-mod-5b tst-cet-legacy-mod-5c \
|
||||||
|
- tst-cet-legacy-mod-6a tst-cet-legacy-mod-6b \
|
||||||
|
- tst-cet-legacy-mod-6c
|
||||||
|
+tests += \
|
||||||
|
+ tst-cet-legacy-4a \
|
||||||
|
+ tst-cet-legacy-4b \
|
||||||
|
+ tst-cet-legacy-4c \
|
||||||
|
+ tst-cet-legacy-5b \
|
||||||
|
+ tst-cet-legacy-6b \
|
||||||
|
+# tests
|
||||||
|
+modules-names += \
|
||||||
|
+ tst-cet-legacy-mod-1 \
|
||||||
|
+ tst-cet-legacy-mod-2 \
|
||||||
|
+ tst-cet-legacy-mod-4 \
|
||||||
|
+ tst-cet-legacy-mod-5a \
|
||||||
|
+ tst-cet-legacy-mod-5b \
|
||||||
|
+ tst-cet-legacy-mod-5c \
|
||||||
|
+ tst-cet-legacy-mod-6a \
|
||||||
|
+ tst-cet-legacy-mod-6b \
|
||||||
|
+ tst-cet-legacy-mod-6c \
|
||||||
|
+# modules-names
|
||||||
|
|
||||||
|
CFLAGS-tst-cet-legacy-2.c += -fcf-protection=branch
|
||||||
|
CFLAGS-tst-cet-legacy-2a.c += -fcf-protection
|
||||||
|
@@ -243,7 +287,9 @@ endif
|
||||||
|
ifeq ($(subdir),posix)
|
||||||
|
tests += \
|
||||||
|
tst-sysconf-cache-linesize \
|
||||||
|
- tst-sysconf-cache-linesize-static
|
||||||
|
+ tst-sysconf-cache-linesize-static \
|
||||||
|
+# tests
|
||||||
|
tests-static += \
|
||||||
|
- tst-sysconf-cache-linesize-static
|
||||||
|
+ tst-sysconf-cache-linesize-static \
|
||||||
|
+# tests-static
|
||||||
|
endif
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
77
x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch
Normal file
77
x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
From 58822f954f6284c8687dfff43fa4e9e349eeccad Mon Sep 17 00:00:00 2001
|
||||||
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||||
|
Date: Mon, 28 Aug 2023 12:08:14 -0700
|
||||||
|
Subject: [PATCH] x86: Check the lower byte of EAX of CPUID leaf 2 [BZ
|
||||||
|
#30643]
|
||||||
|
|
||||||
|
The old Intel software developer manual specified that the low byte of
|
||||||
|
EAX of CPUID leaf 2 returned 1 which indicated the number of rounds of
|
||||||
|
CPUID leaf 2 was needed to retrieve the complete cache information. The
|
||||||
|
newer Intel manual has been changed to say that it should always return 1
|
||||||
|
and be ignored. If the lower byte isn't 1, CPUID leaf 2 can't be used.
|
||||||
|
In this case, we ignore CPUID leaf 2 and use CPUID leaf 4 instead. If
|
||||||
|
CPUID leaf 4 doesn't contain the cache information, cache information
|
||||||
|
isn't available at all. This addresses BZ #30643.
|
||||||
|
|
||||||
|
(cherry picked from commit 1493622f4f9048ffede3fbedb64695efa49d662a)
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 31 +++++++++++++------------------
|
||||||
|
1 file changed, 13 insertions(+), 18 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index 6c7740422a..400d15f208 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -187,7 +187,7 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
|
||||||
|
++round;
|
||||||
|
}
|
||||||
|
/* There is no other cache information anywhere else. */
|
||||||
|
- break;
|
||||||
|
+ return -1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
@@ -257,28 +257,23 @@ handle_intel (int name, const struct cpu_features *cpu_features)
|
||||||
|
|
||||||
|
/* OK, we can use the CPUID instruction to get all info about the
|
||||||
|
caches. */
|
||||||
|
- unsigned int cnt = 0;
|
||||||
|
- unsigned int max = 1;
|
||||||
|
long int result = 0;
|
||||||
|
bool no_level_2_or_3 = false;
|
||||||
|
bool has_level_2 = false;
|
||||||
|
+ unsigned int eax;
|
||||||
|
+ unsigned int ebx;
|
||||||
|
+ unsigned int ecx;
|
||||||
|
+ unsigned int edx;
|
||||||
|
+ __cpuid (2, eax, ebx, ecx, edx);
|
||||||
|
|
||||||
|
- while (cnt++ < max)
|
||||||
|
+ /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
|
||||||
|
+ should be ignored. If it isn't 1, use CPUID leaf 4 instead. */
|
||||||
|
+ if ((eax & 0xff) != 1)
|
||||||
|
+ return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
|
||||||
|
+ cpu_features);
|
||||||
|
+ else
|
||||||
|
{
|
||||||
|
- unsigned int eax;
|
||||||
|
- unsigned int ebx;
|
||||||
|
- unsigned int ecx;
|
||||||
|
- unsigned int edx;
|
||||||
|
- __cpuid (2, eax, ebx, ecx, edx);
|
||||||
|
-
|
||||||
|
- /* The low byte of EAX in the first round contain the number of
|
||||||
|
- rounds we have to make. At least one, the one we are already
|
||||||
|
- doing. */
|
||||||
|
- if (cnt == 1)
|
||||||
|
- {
|
||||||
|
- max = eax & 0xff;
|
||||||
|
- eax &= 0xffffff00;
|
||||||
|
- }
|
||||||
|
+ eax &= 0xffffff00;
|
||||||
|
|
||||||
|
/* Process the individual registers' value. */
|
||||||
|
result = intel_check_word (name, eax, &has_level_2,
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
254
x86-Improve-large-memset-perf-with-non-temporal-stor.patch
Normal file
254
x86-Improve-large-memset-perf-with-non-temporal-stor.patch
Normal file
@ -0,0 +1,254 @@
|
|||||||
|
From 04b8d484323b2ff18b3422c4b883ef4cb6281c53 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Fri, 24 May 2024 12:38:50 -0500
|
||||||
|
Subject: [PATCH] x86: Improve large memset perf with non-temporal stores
|
||||||
|
[RHEL-29312]
|
||||||
|
|
||||||
|
Previously we use `rep stosb` for all medium/large memsets. This is
|
||||||
|
notably worse than non-temporal stores for large (above a
|
||||||
|
few MBs) memsets.
|
||||||
|
See:
|
||||||
|
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
|
||||||
|
For data using different strategies for large memset on ICX and SKX.
|
||||||
|
|
||||||
|
Using non-temporal stores can be up to 3x faster on ICX and 2x faster
|
||||||
|
on SKX. Historically, these numbers would not have been so good
|
||||||
|
because of the zero-over-zero writeback optimization that `rep stosb`
|
||||||
|
is able to do. But, the zero-over-zero writeback optimization has been
|
||||||
|
removed as a potential side-channel attack, so there is no longer any
|
||||||
|
good reason to only rely on `rep stosb` for large memsets. On the flip
|
||||||
|
side, non-temporal writes can avoid data in their RFO requests saving
|
||||||
|
memory bandwidth.
|
||||||
|
|
||||||
|
All of the other changes to the file are to re-organize the
|
||||||
|
code-blocks to maintain "good" alignment given the new code added in
|
||||||
|
the `L(stosb_local)` case.
|
||||||
|
|
||||||
|
The results from running the GLIBC memset benchmarks on TGL-client for
|
||||||
|
N=20 runs:
|
||||||
|
|
||||||
|
Geometric Mean across the suite New / Old EVEX256: 0.979
|
||||||
|
Geometric Mean across the suite New / Old EVEX512: 0.979
|
||||||
|
Geometric Mean across the suite New / Old AVX2 : 0.986
|
||||||
|
Geometric Mean across the suite New / Old SSE2 : 0.979
|
||||||
|
|
||||||
|
Most of the cases are essentially unchanged, this is mostly to show
|
||||||
|
that adding the non-temporal case didn't add any regressions to the
|
||||||
|
other cases.
|
||||||
|
|
||||||
|
The results on the memset-large benchmark suite on TGL-client for N=20
|
||||||
|
runs:
|
||||||
|
|
||||||
|
Geometric Mean across the suite New / Old EVEX256: 0.926
|
||||||
|
Geometric Mean across the suite New / Old EVEX512: 0.925
|
||||||
|
Geometric Mean across the suite New / Old AVX2 : 0.928
|
||||||
|
Geometric Mean across the suite New / Old SSE2 : 0.924
|
||||||
|
|
||||||
|
So roughly a 7.5% speedup. This is lower than what we see on servers
|
||||||
|
(likely because clients typically have faster single-core bandwidth so
|
||||||
|
saving bandwidth on RFOs is less impactful), but still advantageous.
|
||||||
|
|
||||||
|
Full test-suite passes on x86_64 w/ and w/o multiarch.
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
|
||||||
|
(cherry picked from commit 5bf0ab80573d66e4ae5d94b094659094336da90f)
|
||||||
|
---
|
||||||
|
.../multiarch/memset-vec-unaligned-erms.S | 147 +++++++++++-------
|
||||||
|
1 file changed, 91 insertions(+), 56 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
index 0f0636b90f..aba45e3da0 100644
|
||||||
|
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
@@ -21,8 +21,13 @@
|
||||||
|
2. If size is less than VEC, use integer register stores.
|
||||||
|
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
|
||||||
|
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
|
||||||
|
- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
|
||||||
|
- 4 VEC stores and store 4 * VEC at a time until done. */
|
||||||
|
+ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
|
||||||
|
+ 4 VEC stores and store 4 * VEC at a time until done.
|
||||||
|
+ 6. On machines ERMS feature, if size is range
|
||||||
|
+ [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
|
||||||
|
+ then REP STOSB will be used.
|
||||||
|
+ 7. If size >= __x86_shared_non_temporal_threshold, use a
|
||||||
|
+ non-temporal stores. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
|
||||||
|
@@ -145,6 +150,41 @@ L(entry_from_wmemset):
|
||||||
|
VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
|
||||||
|
VMOVU %VMM(0), (%rdi)
|
||||||
|
VZEROUPPER_RETURN
|
||||||
|
+
|
||||||
|
+ /* If have AVX512 mask instructions put L(less_vec) close to
|
||||||
|
+ entry as it doesn't take much space and is likely a hot target. */
|
||||||
|
+#ifdef USE_LESS_VEC_MASK_STORE
|
||||||
|
+ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */
|
||||||
|
+ .p2align 6,, 47
|
||||||
|
+ .p2align 4
|
||||||
|
+L(less_vec):
|
||||||
|
+L(less_vec_from_wmemset):
|
||||||
|
+ /* Less than 1 VEC. */
|
||||||
|
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||||
|
+# error Unsupported VEC_SIZE!
|
||||||
|
+# endif
|
||||||
|
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
||||||
|
+ cross check. Note that we are using rax which is set in
|
||||||
|
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
|
||||||
|
+ andl $(PAGE_SIZE - 1), %edi
|
||||||
|
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
|
||||||
|
+ serious performance degradation when it has to fault suppress. */
|
||||||
|
+ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
|
||||||
|
+ /* This is generally considered a cold target. */
|
||||||
|
+ ja L(cross_page)
|
||||||
|
+# if VEC_SIZE > 32
|
||||||
|
+ movq $-1, %rcx
|
||||||
|
+ bzhiq %rdx, %rcx, %rcx
|
||||||
|
+ kmovq %rcx, %k1
|
||||||
|
+# else
|
||||||
|
+ movl $-1, %ecx
|
||||||
|
+ bzhil %edx, %ecx, %ecx
|
||||||
|
+ kmovd %ecx, %k1
|
||||||
|
+# endif
|
||||||
|
+ vmovdqu8 %VMM(0), (%rax){%k1}
|
||||||
|
+ VZEROUPPER_RETURN
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
#if defined USE_MULTIARCH && IS_IN (libc)
|
||||||
|
END (MEMSET_SYMBOL (__memset, unaligned))
|
||||||
|
|
||||||
|
@@ -183,54 +223,6 @@ L(last_2x_vec):
|
||||||
|
#endif
|
||||||
|
VZEROUPPER_RETURN
|
||||||
|
|
||||||
|
- /* If have AVX512 mask instructions put L(less_vec) close to
|
||||||
|
- entry as it doesn't take much space and is likely a hot target.
|
||||||
|
- */
|
||||||
|
-#ifdef USE_LESS_VEC_MASK_STORE
|
||||||
|
- .p2align 4,, 10
|
||||||
|
-L(less_vec):
|
||||||
|
-L(less_vec_from_wmemset):
|
||||||
|
- /* Less than 1 VEC. */
|
||||||
|
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||||
|
-# error Unsupported VEC_SIZE!
|
||||||
|
-# endif
|
||||||
|
- /* Clear high bits from edi. Only keeping bits relevant to page
|
||||||
|
- cross check. Note that we are using rax which is set in
|
||||||
|
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
|
||||||
|
- andl $(PAGE_SIZE - 1), %edi
|
||||||
|
- /* Check if VEC_SIZE store cross page. Mask stores suffer
|
||||||
|
- serious performance degradation when it has to fault suppress.
|
||||||
|
- */
|
||||||
|
- cmpl $(PAGE_SIZE - VEC_SIZE), %edi
|
||||||
|
- /* This is generally considered a cold target. */
|
||||||
|
- ja L(cross_page)
|
||||||
|
-# if VEC_SIZE > 32
|
||||||
|
- movq $-1, %rcx
|
||||||
|
- bzhiq %rdx, %rcx, %rcx
|
||||||
|
- kmovq %rcx, %k1
|
||||||
|
-# else
|
||||||
|
- movl $-1, %ecx
|
||||||
|
- bzhil %edx, %ecx, %ecx
|
||||||
|
- kmovd %ecx, %k1
|
||||||
|
-# endif
|
||||||
|
- vmovdqu8 %VMM(0), (%rax){%k1}
|
||||||
|
- VZEROUPPER_RETURN
|
||||||
|
-
|
||||||
|
-# if defined USE_MULTIARCH && IS_IN (libc)
|
||||||
|
- /* Include L(stosb_local) here if including L(less_vec) between
|
||||||
|
- L(stosb_more_2x_vec) and ENTRY. This is to cache align the
|
||||||
|
- L(stosb_more_2x_vec) target. */
|
||||||
|
- .p2align 4,, 10
|
||||||
|
-L(stosb_local):
|
||||||
|
- movzbl %sil, %eax
|
||||||
|
- mov %RDX_LP, %RCX_LP
|
||||||
|
- mov %RDI_LP, %RDX_LP
|
||||||
|
- rep stosb
|
||||||
|
- mov %RDX_LP, %RAX_LP
|
||||||
|
- VZEROUPPER_RETURN
|
||||||
|
-# endif
|
||||||
|
-#endif
|
||||||
|
-
|
||||||
|
#if defined USE_MULTIARCH && IS_IN (libc)
|
||||||
|
.p2align 4
|
||||||
|
L(stosb_more_2x_vec):
|
||||||
|
@@ -316,21 +308,33 @@ L(return_vzeroupper):
|
||||||
|
ret
|
||||||
|
#endif
|
||||||
|
|
||||||
|
- .p2align 4,, 10
|
||||||
|
-#ifndef USE_LESS_VEC_MASK_STORE
|
||||||
|
-# if defined USE_MULTIARCH && IS_IN (libc)
|
||||||
|
+#ifdef USE_WITH_AVX2
|
||||||
|
+ .p2align 4
|
||||||
|
+#else
|
||||||
|
+ .p2align 4,, 4
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+#if defined USE_MULTIARCH && IS_IN (libc)
|
||||||
|
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
|
||||||
|
range for 2-byte jump encoding. */
|
||||||
|
L(stosb_local):
|
||||||
|
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||||
|
+ jae L(nt_memset)
|
||||||
|
movzbl %sil, %eax
|
||||||
|
mov %RDX_LP, %RCX_LP
|
||||||
|
mov %RDI_LP, %RDX_LP
|
||||||
|
rep stosb
|
||||||
|
+# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
|
||||||
|
+ /* Use xchg to save 1-byte (this helps align targets below). */
|
||||||
|
+ xchg %RDX_LP, %RAX_LP
|
||||||
|
+# else
|
||||||
|
mov %RDX_LP, %RAX_LP
|
||||||
|
- VZEROUPPER_RETURN
|
||||||
|
# endif
|
||||||
|
+ VZEROUPPER_RETURN
|
||||||
|
+#endif
|
||||||
|
+#ifndef USE_LESS_VEC_MASK_STORE
|
||||||
|
/* Define L(less_vec) only if not otherwise defined. */
|
||||||
|
- .p2align 4
|
||||||
|
+ .p2align 4,, 12
|
||||||
|
L(less_vec):
|
||||||
|
/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
|
||||||
|
xmm). This is only does anything for AVX2. */
|
||||||
|
@@ -421,4 +425,35 @@ L(between_2_3):
|
||||||
|
movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
|
||||||
|
#endif
|
||||||
|
ret
|
||||||
|
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||||
|
+
|
||||||
|
+#if defined USE_MULTIARCH && IS_IN (libc)
|
||||||
|
+# ifdef USE_WITH_AVX512
|
||||||
|
+ /* Force align so the loop doesn't cross a cache-line. */
|
||||||
|
+ .p2align 4
|
||||||
|
+# endif
|
||||||
|
+ .p2align 4,, 7
|
||||||
|
+ /* Memset using non-temporal stores. */
|
||||||
|
+L(nt_memset):
|
||||||
|
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi)
|
||||||
|
+ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx
|
||||||
|
+ /* Align DST. */
|
||||||
|
+ orq $(VEC_SIZE * 1 - 1), %rdi
|
||||||
|
+ incq %rdi
|
||||||
|
+ .p2align 4,, 7
|
||||||
|
+L(nt_loop):
|
||||||
|
+ VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi)
|
||||||
|
+ VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi)
|
||||||
|
+ VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi)
|
||||||
|
+ VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi)
|
||||||
|
+ subq $(VEC_SIZE * -4), %rdi
|
||||||
|
+ cmpq %rdx, %rdi
|
||||||
|
+ jb L(nt_loop)
|
||||||
|
+ sfence
|
||||||
|
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx)
|
||||||
|
+ VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx)
|
||||||
|
+ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx)
|
||||||
|
+ VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx)
|
||||||
|
+ VZEROUPPER_RETURN
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+END(MEMSET_SYMBOL(__memset, unaligned_erms))
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
34
x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
Normal file
34
x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
From 5a64f933655384477d85122c6855dc6d84061810 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Wed, 1 Nov 2023 15:30:26 -0500
|
||||||
|
Subject: [PATCH] x86: Only align destination to 1x VEC_SIZE in memset 4x
|
||||||
|
loop
|
||||||
|
|
||||||
|
Current code aligns to 2x VEC_SIZE. Aligning to 2x has no effect on
|
||||||
|
performance other than potentially resulting in an additional
|
||||||
|
iteration of the loop.
|
||||||
|
1x maintains aligned stores (the only reason to align in this case)
|
||||||
|
and doesn't incur any unnecessary loop iterations.
|
||||||
|
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
|
||||||
|
|
||||||
|
(cherry picked from commit 9469261cf1924d350feeec64d2c80cafbbdcdd4d)
|
||||||
|
---
|
||||||
|
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 2 +-
|
||||||
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
index 3d9ad49cb9..0f0636b90f 100644
|
||||||
|
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
@@ -293,7 +293,7 @@ L(more_2x_vec):
|
||||||
|
leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
|
||||||
|
#endif
|
||||||
|
/* Align dst for loop. */
|
||||||
|
- andq $(VEC_SIZE * -2), %LOOP_REG
|
||||||
|
+ andq $(VEC_SIZE * -1), %LOOP_REG
|
||||||
|
.p2align 4
|
||||||
|
L(loop):
|
||||||
|
VMOVA %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
149
x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
Normal file
149
x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
From 12fec8aae5e17cc4dc3bb079265c46ee78faeddb Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Fri, 27 Sep 2024 15:50:10 -0700
|
||||||
|
Subject: [PATCH] x86/string: Fixup alignment of main loop in
|
||||||
|
str{n}cmp-evex [BZ #32212]
|
||||||
|
|
||||||
|
The loop should be aligned to 32-bytes so that it can ideally run out
|
||||||
|
of the DSB. This is particularly important on Skylake-Server where
|
||||||
|
deficiencies in its DSB implementation make it prone to not being
|
||||||
|
able to run loops out of the DSB.
|
||||||
|
|
||||||
|
For example running strcmp-evex on 200Mb string:
|
||||||
|
|
||||||
|
32-byte aligned loop:
|
||||||
|
- 43,399,578,766 idq.dsb_uops
|
||||||
|
not 32-byte aligned loop:
|
||||||
|
- 6,060,139,704 idq.dsb_uops
|
||||||
|
|
||||||
|
This results in a 25% performance degradation for the non-aligned
|
||||||
|
version.
|
||||||
|
|
||||||
|
The fix is to just ensure the code layout is such that the loop is
|
||||||
|
aligned. (Which was previously the case but was accidentally dropped
|
||||||
|
in 84e7c46df).
|
||||||
|
|
||||||
|
NB: The fix was actually 64-byte alignment. This is because 64-byte
|
||||||
|
alignment generally produces more stable performance than 32-byte
|
||||||
|
aligned code (cache line crosses can affect perf), so if we are going
|
||||||
|
past 16-byte alignment, might as well go to 64. 64-byte alignment
|
||||||
|
also matches most other functions we over-align, so it creates a
|
||||||
|
common point of optimization.
|
||||||
|
|
||||||
|
Times are reported as ratio of Time_With_Patch /
|
||||||
|
Time_Without_Patch. Lower is better.
|
||||||
|
|
||||||
|
The value being reported is the geometric mean of the ratio across
|
||||||
|
all tests in bench-strcmp and bench-strncmp.
|
||||||
|
|
||||||
|
Note this patch is only attempting to improve the Skylake-Server
|
||||||
|
strcmp for long strings. The rest of the numbers are only to test for
|
||||||
|
regressions.
|
||||||
|
|
||||||
|
Tigerlake Results Strings <= 512:
|
||||||
|
strcmp : 1.026
|
||||||
|
strncmp: 0.949
|
||||||
|
|
||||||
|
Tigerlake Results Strings > 512:
|
||||||
|
strcmp : 0.994
|
||||||
|
strncmp: 0.998
|
||||||
|
|
||||||
|
Skylake-Server Results Strings <= 512:
|
||||||
|
strcmp : 0.945
|
||||||
|
strncmp: 0.943
|
||||||
|
|
||||||
|
Skylake-Server Results Strings > 512:
|
||||||
|
strcmp : 0.778
|
||||||
|
strncmp: 1.000
|
||||||
|
|
||||||
|
The 2.6% regression on TGL-strcmp is due to slowdowns caused by
|
||||||
|
changes in alignment of code handling small sizes (most on the
|
||||||
|
page-cross logic). These should be safe to ignore because 1) We
|
||||||
|
previously only 16-byte aligned the function so this behavior is not
|
||||||
|
new and was essentially up to chance before this patch and 2) this
|
||||||
|
type of alignment related regression on small sizes really only comes
|
||||||
|
up in tight micro-benchmark loops and is unlikely to have any effect
|
||||||
|
on realworld performance.
|
||||||
|
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
(cherry picked from commit 483443d3211532903d7e790211af5a1d55fdb1f3)
|
||||||
|
---
|
||||||
|
sysdeps/x86_64/multiarch/strcmp-evex.S | 26 +++++++++++++-------------
|
||||||
|
1 file changed, 13 insertions(+), 13 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||||
|
index ae39cdf217..6a7fec669e 100644
|
||||||
|
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||||
|
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||||
|
@@ -209,7 +209,9 @@
|
||||||
|
returned. */
|
||||||
|
|
||||||
|
.section SECTION(.text), "ax", @progbits
|
||||||
|
- .align 16
|
||||||
|
+ /* Align 64 bytes here. This is to get the L(loop) block ideally
|
||||||
|
+ aligned for the DSB. */
|
||||||
|
+ .align 64
|
||||||
|
.type STRCMP, @function
|
||||||
|
.globl STRCMP
|
||||||
|
# ifdef USE_AS_STRCASECMP_L
|
||||||
|
@@ -509,9 +511,7 @@ L(ret4):
|
||||||
|
ret
|
||||||
|
# endif
|
||||||
|
|
||||||
|
- /* 32 byte align here ensures the main loop is ideally aligned
|
||||||
|
- for DSB. */
|
||||||
|
- .p2align 5
|
||||||
|
+ .p2align 4,, 4
|
||||||
|
L(more_3x_vec):
|
||||||
|
/* Safe to compare 4x vectors. */
|
||||||
|
VMOVU (VEC_SIZE)(%rdi), %VMM(0)
|
||||||
|
@@ -1426,10 +1426,9 @@ L(less_32_till_page):
|
||||||
|
L(ret_zero_page_cross_slow_case0):
|
||||||
|
xorl %eax, %eax
|
||||||
|
ret
|
||||||
|
-# endif
|
||||||
|
-
|
||||||
|
-
|
||||||
|
+# else
|
||||||
|
.p2align 4,, 10
|
||||||
|
+# endif
|
||||||
|
L(less_16_till_page):
|
||||||
|
cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
|
||||||
|
ja L(less_8_till_page)
|
||||||
|
@@ -1482,8 +1481,12 @@ L(less_16_till_page):
|
||||||
|
# endif
|
||||||
|
jmp L(prepare_loop_aligned)
|
||||||
|
|
||||||
|
-
|
||||||
|
-
|
||||||
|
+# ifndef USE_AS_STRNCMP
|
||||||
|
+ /* Fits in aligning bytes. */
|
||||||
|
+L(ret_zero_4_loop):
|
||||||
|
+ xorl %eax, %eax
|
||||||
|
+ ret
|
||||||
|
+# endif
|
||||||
|
|
||||||
|
.p2align 4,, 10
|
||||||
|
L(less_8_till_page):
|
||||||
|
@@ -1554,6 +1557,7 @@ L(ret_less_8_wcs):
|
||||||
|
|
||||||
|
# ifdef USE_AS_STRNCMP
|
||||||
|
.p2align 4,, 2
|
||||||
|
+L(ret_zero_4_loop):
|
||||||
|
L(ret_zero_page_cross_slow_case1):
|
||||||
|
xorl %eax, %eax
|
||||||
|
ret
|
||||||
|
@@ -1586,10 +1590,6 @@ L(less_4_loop):
|
||||||
|
subq $-(CHAR_PER_VEC * 4), %rdx
|
||||||
|
# endif
|
||||||
|
jmp L(prepare_loop_aligned)
|
||||||
|
-
|
||||||
|
-L(ret_zero_4_loop):
|
||||||
|
- xorl %eax, %eax
|
||||||
|
- ret
|
||||||
|
L(ret_less_4_loop):
|
||||||
|
xorl %r8d, %eax
|
||||||
|
subl %r8d, %eax
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
135
x86_64-Add-expm1-with-FMA.patch
Normal file
135
x86_64-Add-expm1-with-FMA.patch
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
From b2a45f1eee39d67c1fff2d697d32857fb13c8c5d Mon Sep 17 00:00:00 2001
|
||||||
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||||
|
Date: Fri, 11 Aug 2023 08:04:08 -0700
|
||||||
|
Subject: [PATCH] x86_64: Add expm1 with FMA
|
||||||
|
|
||||||
|
On Skylake, it improves expm1 bench performance by:
|
||||||
|
|
||||||
|
Before After Improvement
|
||||||
|
max 70.204 68.054 3%
|
||||||
|
min 20.709 16.2 22%
|
||||||
|
mean 22.1221 16.7367 24%
|
||||||
|
|
||||||
|
NB: Add
|
||||||
|
|
||||||
|
extern long double __expm1l (long double);
|
||||||
|
extern long double __expm1f128 (long double);
|
||||||
|
|
||||||
|
for __typeof (__expm1l) and __typeof (__expm1f128) when __expm1 is
|
||||||
|
defined since __expm1 may be expanded in their declarations which
|
||||||
|
causes the build failure.
|
||||||
|
|
||||||
|
(cherry picked from commit 1b214630ce6f7e0099b8b6f87246246739b079cf)
|
||||||
|
---
|
||||||
|
sysdeps/ieee754/dbl-64/s_expm1.c | 7 +++++
|
||||||
|
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
|
||||||
|
sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c | 10 ++++++
|
||||||
|
sysdeps/x86_64/fpu/multiarch/s_expm1.c | 36 ++++++++++++++++++++++
|
||||||
|
4 files changed, 55 insertions(+)
|
||||||
|
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
|
||||||
|
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1.c
|
||||||
|
|
||||||
|
diff --git a/sysdeps/ieee754/dbl-64/s_expm1.c b/sysdeps/ieee754/dbl-64/s_expm1.c
|
||||||
|
index 8f1c95bd04..1cafeca9c0 100644
|
||||||
|
--- a/sysdeps/ieee754/dbl-64/s_expm1.c
|
||||||
|
+++ b/sysdeps/ieee754/dbl-64/s_expm1.c
|
||||||
|
@@ -130,6 +130,11 @@ static const double
|
||||||
|
4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */
|
||||||
|
-2.01099218183624371326e-07 }; /* BE8AFDB7 6E09C32D */
|
||||||
|
|
||||||
|
+#ifndef SECTION
|
||||||
|
+# define SECTION
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+SECTION
|
||||||
|
double
|
||||||
|
__expm1 (double x)
|
||||||
|
{
|
||||||
|
@@ -258,4 +263,6 @@ __expm1 (double x)
|
||||||
|
}
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
+#ifndef __expm1
|
||||||
|
libm_alias_double (__expm1, expm1)
|
||||||
|
+#endif
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
index f773255721..add339a876 100644
|
||||||
|
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
@@ -37,6 +37,7 @@ libm-sysdep_routines += \
|
||||||
|
e_log2-fma \
|
||||||
|
e_pow-fma \
|
||||||
|
s_atan-fma \
|
||||||
|
+ s_expm1-fma \
|
||||||
|
s_sin-fma \
|
||||||
|
s_sincos-fma \
|
||||||
|
s_tan-fma \
|
||||||
|
@@ -49,6 +50,7 @@ CFLAGS-e_log-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_log2-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_pow-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_atan-fma.c = -mfma -mavx2
|
||||||
|
+CFLAGS-s_expm1-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_sin-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_tan-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_sincos-fma.c = -mfma -mavx2
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..3ee2bd804e
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
|
||||||
|
@@ -0,0 +1,10 @@
|
||||||
|
+#define __expm1 __expm1_fma
|
||||||
|
+
|
||||||
|
+/* NB: __expm1 may be expanded to __expm1_fma in the following
|
||||||
|
+ prototypes. */
|
||||||
|
+extern long double __expm1l (long double);
|
||||||
|
+extern long double __expm1f128 (long double);
|
||||||
|
+
|
||||||
|
+#define SECTION __attribute__ ((section (".text.fma")))
|
||||||
|
+
|
||||||
|
+#include <sysdeps/ieee754/dbl-64/s_expm1.c>
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1.c b/sysdeps/x86_64/fpu/multiarch/s_expm1.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..2cae83fb7f
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/s_expm1.c
|
||||||
|
@@ -0,0 +1,36 @@
|
||||||
|
+/* Multiple versions of expm1.
|
||||||
|
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <libm-alias-double.h>
|
||||||
|
+
|
||||||
|
+extern double __redirect_expm1 (double);
|
||||||
|
+
|
||||||
|
+#define SYMBOL_NAME expm1
|
||||||
|
+#include "ifunc-fma.h"
|
||||||
|
+
|
||||||
|
+libc_ifunc_redirected (__redirect_expm1, __expm1, IFUNC_SELECTOR ());
|
||||||
|
+libm_alias_double (__expm1, expm1)
|
||||||
|
+
|
||||||
|
+#define __expm1 __expm1_sse2
|
||||||
|
+
|
||||||
|
+/* NB: __expm1 may be expanded to __expm1_sse2 in the following
|
||||||
|
+ prototypes. */
|
||||||
|
+extern long double __expm1l (long double);
|
||||||
|
+extern long double __expm1f128 (long double);
|
||||||
|
+
|
||||||
|
+#include <sysdeps/ieee754/dbl-64/s_expm1.c>
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
140
x86_64-Add-log1p-with-FMA.patch
Normal file
140
x86_64-Add-log1p-with-FMA.patch
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
From c92946d9b29956be78ca4487264848714fd5d505 Mon Sep 17 00:00:00 2001
|
||||||
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||||
|
Date: Thu, 17 Aug 2023 09:42:29 -0700
|
||||||
|
Subject: [PATCH] x86_64: Add log1p with FMA
|
||||||
|
|
||||||
|
On Skylake, it changes log1p bench performance by:
|
||||||
|
|
||||||
|
Before After Improvement
|
||||||
|
max 63.349 58.347 8%
|
||||||
|
min 4.448 5.651 -30%
|
||||||
|
mean 12.0674 10.336 14%
|
||||||
|
|
||||||
|
The minimum code path is
|
||||||
|
|
||||||
|
if (hx < 0x3FDA827A) /* x < 0.41422 */
|
||||||
|
{
|
||||||
|
if (__glibc_unlikely (ax >= 0x3ff00000)) /* x <= -1.0 */
|
||||||
|
{
|
||||||
|
...
|
||||||
|
}
|
||||||
|
if (__glibc_unlikely (ax < 0x3e200000)) /* |x| < 2**-29 */
|
||||||
|
{
|
||||||
|
math_force_eval (two54 + x); /* raise inexact */
|
||||||
|
if (ax < 0x3c900000) /* |x| < 2**-54 */
|
||||||
|
{
|
||||||
|
...
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return x - x * x * 0.5;
|
||||||
|
|
||||||
|
FMA and non-FMA code sequences look similar. Non-FMA version is slightly
|
||||||
|
faster. Since log1p is called by asinh and atanh, it improves asinh
|
||||||
|
performance by:
|
||||||
|
|
||||||
|
Before After Improvement
|
||||||
|
max 75.645 63.135 16%
|
||||||
|
min 10.074 10.071 0%
|
||||||
|
mean 15.9483 14.9089 6%
|
||||||
|
|
||||||
|
and improves atanh performance by:
|
||||||
|
|
||||||
|
Before After Improvement
|
||||||
|
max 91.768 75.081 18%
|
||||||
|
min 15.548 13.883 10%
|
||||||
|
mean 18.3713 16.8011 8%
|
||||||
|
|
||||||
|
(cherry picked from commit a8ecb126d4c26c52f4ad828c566afe4043a28155)
|
||||||
|
---
|
||||||
|
sysdeps/ieee754/dbl-64/s_log1p.c | 5 ++++
|
||||||
|
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
|
||||||
|
sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c | 4 +++
|
||||||
|
sysdeps/x86_64/fpu/multiarch/s_log1p.c | 29 ++++++++++++++++++++++
|
||||||
|
4 files changed, 40 insertions(+)
|
||||||
|
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
|
||||||
|
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p.c
|
||||||
|
|
||||||
|
diff --git a/sysdeps/ieee754/dbl-64/s_log1p.c b/sysdeps/ieee754/dbl-64/s_log1p.c
|
||||||
|
index e6476a8260..eeb0af859f 100644
|
||||||
|
--- a/sysdeps/ieee754/dbl-64/s_log1p.c
|
||||||
|
+++ b/sysdeps/ieee754/dbl-64/s_log1p.c
|
||||||
|
@@ -99,6 +99,11 @@ static const double
|
||||||
|
|
||||||
|
static const double zero = 0.0;
|
||||||
|
|
||||||
|
+#ifndef SECTION
|
||||||
|
+# define SECTION
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+SECTION
|
||||||
|
double
|
||||||
|
__log1p (double x)
|
||||||
|
{
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
index add339a876..ea81753b70 100644
|
||||||
|
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
@@ -38,6 +38,7 @@ libm-sysdep_routines += \
|
||||||
|
e_pow-fma \
|
||||||
|
s_atan-fma \
|
||||||
|
s_expm1-fma \
|
||||||
|
+ s_log1p-fma \
|
||||||
|
s_sin-fma \
|
||||||
|
s_sincos-fma \
|
||||||
|
s_tan-fma \
|
||||||
|
@@ -51,6 +52,7 @@ CFLAGS-e_log2-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_pow-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_atan-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_expm1-fma.c = -mfma -mavx2
|
||||||
|
+CFLAGS-s_log1p-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_sin-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_tan-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_sincos-fma.c = -mfma -mavx2
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..8952df8f9e
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
|
||||||
|
@@ -0,0 +1,4 @@
|
||||||
|
+#define __log1p __log1p_fma
|
||||||
|
+#define SECTION __attribute__ ((section (".text.fma")))
|
||||||
|
+
|
||||||
|
+#include <sysdeps/ieee754/dbl-64/s_log1p.c>
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p.c b/sysdeps/x86_64/fpu/multiarch/s_log1p.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..6ce5198d6d
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/s_log1p.c
|
||||||
|
@@ -0,0 +1,29 @@
|
||||||
|
+/* Multiple versions of log1p.
|
||||||
|
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <libm-alias-double.h>
|
||||||
|
+
|
||||||
|
+extern double __redirect_log1p (double);
|
||||||
|
+
|
||||||
|
+#define SYMBOL_NAME log1p
|
||||||
|
+#include "ifunc-fma.h"
|
||||||
|
+
|
||||||
|
+libc_ifunc_redirected (__redirect_log1p, __log1p, IFUNC_SELECTOR ());
|
||||||
|
+
|
||||||
|
+#define __log1p __log1p_sse2
|
||||||
|
+#include <sysdeps/ieee754/dbl-64/s_log1p.c>
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
102
x86_64-Add-log2-with-FMA.patch
Normal file
102
x86_64-Add-log2-with-FMA.patch
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
From 49016f2190693d5b2d4d6294d438ebae7a58d151 Mon Sep 17 00:00:00 2001
|
||||||
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||||
|
Date: Thu, 10 Aug 2023 11:24:30 -0700
|
||||||
|
Subject: [PATCH] x86_64: Add log2 with FMA
|
||||||
|
|
||||||
|
On Skylake, it improves log2 bench performance by:
|
||||||
|
|
||||||
|
Before After Improvement
|
||||||
|
max 208.779 63.827 69%
|
||||||
|
min 9.977 6.55 34%
|
||||||
|
mean 10.366 6.8191 34%
|
||||||
|
|
||||||
|
(cherry picked from commit f6b10ed8e9a00de49d0951e760cc2b5288862b47)
|
||||||
|
---
|
||||||
|
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
|
||||||
|
sysdeps/x86_64/fpu/multiarch/e_log2-fma.c | 3 ++
|
||||||
|
sysdeps/x86_64/fpu/multiarch/e_log2.c | 43 +++++++++++++++++++++++
|
||||||
|
3 files changed, 48 insertions(+)
|
||||||
|
create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
|
||||||
|
create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2.c
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
index e37e488c37..f773255721 100644
|
||||||
|
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
@@ -34,6 +34,7 @@ libm-sysdep_routines += \
|
||||||
|
e_atan2-fma \
|
||||||
|
e_exp-fma \
|
||||||
|
e_log-fma \
|
||||||
|
+ e_log2-fma \
|
||||||
|
e_pow-fma \
|
||||||
|
s_atan-fma \
|
||||||
|
s_sin-fma \
|
||||||
|
@@ -45,6 +46,7 @@ CFLAGS-e_asin-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_atan2-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_exp-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_log-fma.c = -mfma -mavx2
|
||||||
|
+CFLAGS-e_log2-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_pow-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_atan-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_sin-fma.c = -mfma -mavx2
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..9fbebc1b47
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
|
||||||
|
@@ -0,0 +1,3 @@
|
||||||
|
+#define __log2 __log2_fma
|
||||||
|
+
|
||||||
|
+#include <sysdeps/ieee754/dbl-64/e_log2.c>
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2.c b/sysdeps/x86_64/fpu/multiarch/e_log2.c
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000..c0320caf36
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/e_log2.c
|
||||||
|
@@ -0,0 +1,43 @@
|
||||||
|
+/* Multiple versions of log2.
|
||||||
|
+ Copyright (C) 2023 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <libm-alias-double.h>
|
||||||
|
+#include <libm-alias-finite.h>
|
||||||
|
+
|
||||||
|
+extern double __redirect_log2 (double);
|
||||||
|
+
|
||||||
|
+#define SYMBOL_NAME log2
|
||||||
|
+#include "ifunc-fma.h"
|
||||||
|
+
|
||||||
|
+libc_ifunc_redirected (__redirect_log2, __log2, IFUNC_SELECTOR ());
|
||||||
|
+
|
||||||
|
+#ifdef SHARED
|
||||||
|
+__hidden_ver1 (__log2, __GI___log2, __redirect_log2)
|
||||||
|
+ __attribute__ ((visibility ("hidden")));
|
||||||
|
+
|
||||||
|
+versioned_symbol (libm, __ieee754_log2, log2, GLIBC_2_29);
|
||||||
|
+libm_alias_double_other (__log2, log2)
|
||||||
|
+#else
|
||||||
|
+libm_alias_double (__log2, log2)
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+strong_alias (__log2, __ieee754_log2)
|
||||||
|
+libm_alias_finite (__log2, __log2)
|
||||||
|
+
|
||||||
|
+#define __log2 __log2_sse2
|
||||||
|
+#include <sysdeps/ieee754/dbl-64/e_log2.c>
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
44
x86_64-Fix-missing-wcsncat-function-definition-witho.patch
Normal file
44
x86_64-Fix-missing-wcsncat-function-definition-witho.patch
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
From dc1762113dbe40be832bedd41b52d9822d62c50f Mon Sep 17 00:00:00 2001
|
||||||
|
From: Gabi Falk <gabifalk@gmx.com>
|
||||||
|
Date: Tue, 7 May 2024 18:25:00 +0000
|
||||||
|
Subject: [PATCH] x86_64: Fix missing wcsncat function definition without
|
||||||
|
multiarch (x86-64-v4)
|
||||||
|
|
||||||
|
This code expects the WCSCAT preprocessor macro to be predefined in case
|
||||||
|
the evex implementation of the function should be defined with a name
|
||||||
|
different from __wcsncat_evex. However, when glibc is built for
|
||||||
|
x86-64-v4 without multiarch support, sysdeps/x86_64/wcsncat.S defines
|
||||||
|
WCSNCAT variable instead of WCSCAT to build it as wcsncat. Rename the
|
||||||
|
variable to WCSNCAT, as it is actually a better naming choice for the
|
||||||
|
variable in this case.
|
||||||
|
|
||||||
|
Reported-by: Kenton Groombridge
|
||||||
|
Link: https://bugs.gentoo.org/921945
|
||||||
|
Fixes: 64b8b6516b ("x86: Add evex optimized functions for the wchar_t strcpy family")
|
||||||
|
Signed-off-by: Gabi Falk <gabifalk@gmx.com>
|
||||||
|
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
|
||||||
|
(cherry picked from commit dd5f891c1ad9f1b43b9db93afe2a55cbb7a6194e)
|
||||||
|
---
|
||||||
|
sysdeps/x86_64/multiarch/wcsncat-evex.S | 6 +++---
|
||||||
|
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
|
||||||
|
index 392215950a..10bfb0a531 100644
|
||||||
|
--- a/sysdeps/x86_64/multiarch/wcsncat-evex.S
|
||||||
|
+++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
|
||||||
|
@@ -1,9 +1,9 @@
|
||||||
|
-#ifndef WCSCAT
|
||||||
|
-# define WCSCAT __wcsncat_evex
|
||||||
|
+#ifndef WCSNCAT
|
||||||
|
+# define WCSNCAT __wcsncat_evex
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define USE_AS_WCSCPY
|
||||||
|
#define USE_AS_STRCAT
|
||||||
|
|
||||||
|
-#define STRNCAT WCSCAT
|
||||||
|
+#define STRNCAT WCSNCAT
|
||||||
|
#include "strncat-evex.S"
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
144
x86_64-Sort-fpu-multiarch-Makefile.patch
Normal file
144
x86_64-Sort-fpu-multiarch-Makefile.patch
Normal file
@ -0,0 +1,144 @@
|
|||||||
|
From 5c9be512ee25ceab92a284adc75fe22bbd94b179 Mon Sep 17 00:00:00 2001
|
||||||
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||||
|
Date: Wed, 9 Aug 2023 11:08:52 -0700
|
||||||
|
Subject: [PATCH] x86_64: Sort fpu/multiarch/Makefile
|
||||||
|
|
||||||
|
Sort Makefile variables using scripts/sort-makefile-lines.py.
|
||||||
|
|
||||||
|
No code generation changes observed in libm. No regressions on x86_64.
|
||||||
|
|
||||||
|
(cherry picked from commit 881546979d0219c18337e1b4f4d00cfacab13c40)
|
||||||
|
---
|
||||||
|
sysdeps/x86_64/fpu/multiarch/Makefile | 94 +++++++++++++++++++++------
|
||||||
|
1 file changed, 74 insertions(+), 20 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
index 248162525b..e37e488c37 100644
|
||||||
|
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||||
|
@@ -1,17 +1,45 @@
|
||||||
|
ifeq ($(subdir),math)
|
||||||
|
-libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
|
||||||
|
- s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
|
||||||
|
- s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
|
||||||
|
+libm-sysdep_routines += \
|
||||||
|
+ s_ceil-c \
|
||||||
|
+ s_ceilf-c \
|
||||||
|
+ s_floor-c \
|
||||||
|
+ s_floorf-c \
|
||||||
|
+ s_rint-c \
|
||||||
|
+ s_rintf-c \
|
||||||
|
+ s_nearbyint-c \
|
||||||
|
+ s_nearbyintf-c \
|
||||||
|
+ s_roundeven-c \
|
||||||
|
+ s_roundevenf-c \
|
||||||
|
+ s_trunc-c \
|
||||||
|
+ s_truncf-c \
|
||||||
|
+# libm-sysdep_routines
|
||||||
|
|
||||||
|
-libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
|
||||||
|
- s_floorf-sse4_1 s_nearbyint-sse4_1 \
|
||||||
|
- s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
|
||||||
|
- s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
|
||||||
|
- s_trunc-sse4_1 s_truncf-sse4_1
|
||||||
|
+libm-sysdep_routines += \
|
||||||
|
+ s_ceil-sse4_1 \
|
||||||
|
+ s_ceilf-sse4_1 \
|
||||||
|
+ s_floor-sse4_1 \
|
||||||
|
+ s_floorf-sse4_1 \
|
||||||
|
+ s_nearbyint-sse4_1 \
|
||||||
|
+ s_nearbyintf-sse4_1 \
|
||||||
|
+ s_roundeven-sse4_1 \
|
||||||
|
+ s_roundevenf-sse4_1 \
|
||||||
|
+ s_rint-sse4_1 \
|
||||||
|
+ s_rintf-sse4_1 \
|
||||||
|
+ s_trunc-sse4_1 \
|
||||||
|
+ s_truncf-sse4_1 \
|
||||||
|
+# libm-sysdep_routines
|
||||||
|
|
||||||
|
-libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
|
||||||
|
- e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
|
||||||
|
- s_sincos-fma
|
||||||
|
+libm-sysdep_routines += \
|
||||||
|
+ e_asin-fma \
|
||||||
|
+ e_atan2-fma \
|
||||||
|
+ e_exp-fma \
|
||||||
|
+ e_log-fma \
|
||||||
|
+ e_pow-fma \
|
||||||
|
+ s_atan-fma \
|
||||||
|
+ s_sin-fma \
|
||||||
|
+ s_sincos-fma \
|
||||||
|
+ s_tan-fma \
|
||||||
|
+# libm-sysdep_routines
|
||||||
|
|
||||||
|
CFLAGS-e_asin-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_atan2-fma.c = -mfma -mavx2
|
||||||
|
@@ -23,10 +51,22 @@ CFLAGS-s_sin-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_tan-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_sincos-fma.c = -mfma -mavx2
|
||||||
|
|
||||||
|
-libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
|
||||||
|
+libm-sysdep_routines += \
|
||||||
|
+ s_cosf-sse2 \
|
||||||
|
+ s_sincosf-sse2 \
|
||||||
|
+ s_sinf-sse2 \
|
||||||
|
+# libm-sysdep_routines
|
||||||
|
|
||||||
|
-libm-sysdep_routines += e_exp2f-fma e_expf-fma e_log2f-fma e_logf-fma \
|
||||||
|
- e_powf-fma s_sinf-fma s_cosf-fma s_sincosf-fma
|
||||||
|
+libm-sysdep_routines += \
|
||||||
|
+ e_exp2f-fma \
|
||||||
|
+ e_expf-fma \
|
||||||
|
+ e_log2f-fma \
|
||||||
|
+ e_logf-fma \
|
||||||
|
+ e_powf-fma \
|
||||||
|
+ s_cosf-fma \
|
||||||
|
+ s_sincosf-fma \
|
||||||
|
+ s_sinf-fma \
|
||||||
|
+# libm-sysdep_routines
|
||||||
|
|
||||||
|
CFLAGS-e_exp2f-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-e_expf-fma.c = -mfma -mavx2
|
||||||
|
@@ -37,9 +77,17 @@ CFLAGS-s_sinf-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_cosf-fma.c = -mfma -mavx2
|
||||||
|
CFLAGS-s_sincosf-fma.c = -mfma -mavx2
|
||||||
|
|
||||||
|
-libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
|
||||||
|
- e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
|
||||||
|
- s_sincos-fma4
|
||||||
|
+libm-sysdep_routines += \
|
||||||
|
+ e_exp-fma4 \
|
||||||
|
+ e_log-fma4 \
|
||||||
|
+ e_pow-fma4 \
|
||||||
|
+ e_asin-fma4 \
|
||||||
|
+ s_atan-fma4 \
|
||||||
|
+ e_atan2-fma4 \
|
||||||
|
+ s_sin-fma4 \
|
||||||
|
+ s_sincos-fma4 \
|
||||||
|
+ s_tan-fma4 \
|
||||||
|
+# libm-sysdep_routines
|
||||||
|
|
||||||
|
CFLAGS-e_asin-fma4.c = -mfma4
|
||||||
|
CFLAGS-e_atan2-fma4.c = -mfma4
|
||||||
|
@@ -51,9 +99,15 @@ CFLAGS-s_sin-fma4.c = -mfma4
|
||||||
|
CFLAGS-s_tan-fma4.c = -mfma4
|
||||||
|
CFLAGS-s_sincos-fma4.c = -mfma4
|
||||||
|
|
||||||
|
-libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
|
||||||
|
- e_atan2-avx s_sin-avx s_tan-avx \
|
||||||
|
- s_sincos-avx
|
||||||
|
+libm-sysdep_routines += \
|
||||||
|
+ e_exp-avx \
|
||||||
|
+ e_log-avx \
|
||||||
|
+ s_atan-avx \
|
||||||
|
+ e_atan2-avx \
|
||||||
|
+ s_sin-avx \
|
||||||
|
+ s_sincos-avx \
|
||||||
|
+ s_tan-avx \
|
||||||
|
+# libm-sysdep_routines
|
||||||
|
|
||||||
|
CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
|
||||||
|
CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
|
||||||
|
--
|
||||||
|
2.27.0
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user