!983 [sync] PR-982: backport from glibc upstream 2.38 branch

From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
openeuler-ci-bot 2025-01-27 03:00:39 +00:00 committed by Gitee
commit de3e1f1fa3
16 changed files with 2353 additions and 1 deletion


@ -0,0 +1,89 @@
From c32fd59314c343db88c3ea4a203870481d33c3d2 Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Tue, 21 Jan 2025 16:11:06 -0500
Subject: [PATCH] Fix underallocation of abort_msg_s struct
(CVE-2025-0395)
Include the space needed to store the length of the message itself, in
addition to the message string. This resolves BZ #32582.
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 68ee0f704cb81e9ad0a78c644a83e1e9cd2ee578)
---
NEWS | 6 ++++++
assert/assert.c | 4 +++-
sysdeps/posix/libc_fatal.c | 4 +++-
3 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/NEWS b/NEWS
index d0815514e0..3e511d6de4 100644
--- a/NEWS
+++ b/NEWS
@@ -34,6 +34,11 @@ Security related changes:
buffer overflow, which could be exploited to achieve escalated
privileges. This flaw was introduced in glibc 2.34.
+ CVE-2025-0395: When the assert() function fails, it does not allocate
+ enough space for the assertion failure message string and size
+ information, which may lead to a buffer overflow if the message string
+ size aligns to page size.
+
The following bugs are resolved with this release:
[27821] ungetc: Fix backup buffer leak on program exit
@@ -61,6 +66,7 @@ The following bugs are resolved with this release:
[32137] libio: Attempt wide backup free only for non-legacy code
[32231] elf: Change ldconfig auxcache magic number
[32470] x86: Avoid integer truncation with large cache sizes
+ [32582] Fix underallocation of abort_msg_s struct (CVE-2025-0395)
Version 2.38
diff --git a/assert/assert.c b/assert/assert.c
index b7c7a4a1ba..65a9fedf0d 100644
--- a/assert/assert.c
+++ b/assert/assert.c
@@ -18,6 +18,7 @@
#include <assert.h>
#include <atomic.h>
#include <ldsodefs.h>
+#include <libc-pointer-arith.h>
#include <libintl.h>
#include <stdio.h>
#include <stdlib.h>
@@ -64,7 +65,8 @@ __assert_fail_base (const char *fmt, const char *assertion, const char *file,
(void) __fxprintf (NULL, "%s", str);
(void) fflush (stderr);
- total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
+ total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
+ GLRO(dl_pagesize));
struct abort_msg_s *buf = __mmap (NULL, total, PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
if (__glibc_likely (buf != MAP_FAILED))
diff --git a/sysdeps/posix/libc_fatal.c b/sysdeps/posix/libc_fatal.c
index 70edcc10c1..5b9e4b7918 100644
--- a/sysdeps/posix/libc_fatal.c
+++ b/sysdeps/posix/libc_fatal.c
@@ -20,6 +20,7 @@
#include <errno.h>
#include <fcntl.h>
#include <ldsodefs.h>
+#include <libc-pointer-arith.h>
#include <paths.h>
#include <stdarg.h>
#include <stdbool.h>
@@ -123,7 +124,8 @@ __libc_message (const char *fmt, ...)
WRITEV_FOR_FATAL (fd, iov, nlist, total);
- total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
+ total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
+ GLRO(dl_pagesize));
struct abort_msg_s *buf = __mmap (NULL, total,
PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
--
2.27.0
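For illustration, a minimal standalone sketch (not part of the patch; the struct and ALIGN_UP below are simplified stand-ins for the glibc internals) of why the old size computation underallocates when the message length is a multiple of the page size:

#include <stdio.h>
#include <stddef.h>

/* Simplified stand-in for glibc's struct abort_msg_s: a size field
   followed by the message bytes.  */
struct abort_msg { size_t size; char msg[]; };

#define ALIGN_UP(v, a) (((v) + (a) - 1) & ~((size_t) (a) - 1))

int
main (void)
{
  size_t pagesize = 4096;
  size_t total = 4095;   /* message length, so total + 1 == pagesize */

  /* Old computation: rounds total + 1 up to a page but reserves no
     room for the struct header itself.  */
  size_t old_alloc = (total + 1 + pagesize - 1) & ~(pagesize - 1);

  /* New computation: also reserves sizeof (struct abort_msg).  */
  size_t new_alloc = ALIGN_UP (total + sizeof (struct abort_msg) + 1, pagesize);

  size_t needed = sizeof (struct abort_msg) + total + 1;
  /* Prints needed=4104 old=4096 new=8192 on a 64-bit target: the old
     mapping is one header short, so copying the message overflows it.  */
  printf ("needed=%zu old=%zu new=%zu\n", needed, old_alloc, new_alloc);
  return 0;
}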


@ -0,0 +1,50 @@
From 48642ef1a5721e0a7694d84fe46d83b6086dfe75 Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Mon, 3 Jun 2024 10:49:40 +0200
Subject: [PATCH] elf: Avoid some free (NULL) calls in
_dl_update_slotinfo
This has been confirmed to work around some interposed mallocs. Here
is a discussion of the impact test ust/libc-wrapper/test_libc-wrapper
in lttng-tools:
New TLS usage in libgcc_s.so.1, compatibility impact
<https://inbox.sourceware.org/libc-alpha/8734v1ieke.fsf@oldenburg.str.redhat.com/>
Reportedly, this patch also papers over a similar issue when tcmalloc
2.9.1 is not compiled with -ftls-model=initial-exec. Of course the
goal really should be to compile mallocs with the initial-exec TLS
model, but this commit appears to be a useful interim workaround.
Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow
tls access after dlopen [BZ #19924]").
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit afe42e935b3ee97bac9a7064157587777259c60e)
---
elf/dl-tls.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 70446e71a8..de0168319c 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -819,7 +819,14 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
dtv entry free it. Note: this is not AS-safe. */
/* XXX Ideally we will at some point create a memory
pool. */
- free (dtv[modid].pointer.to_free);
+ /* Avoid calling free on a null pointer. Some mallocs
+ incorrectly use dynamic TLS, and depending on how the
+ free function was compiled, it could call
+ __tls_get_addr before the null pointer check in the
+ free implementation. Checking here papers over at
+ least some dynamic TLS usage by interposed mallocs. */
+ if (dtv[modid].pointer.to_free != NULL)
+ free (dtv[modid].pointer.to_free);
dtv[modid].pointer.val = TLS_DTV_UNALLOCATED;
dtv[modid].pointer.to_free = NULL;
--
2.27.0
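The workaround above matters because an interposed free may touch its own thread-local state before it ever checks for NULL. A minimal, hypothetical interposer illustrating that (illustrative names only; built as a shared object without -ftls-model=initial-exec, the __thread access goes through __tls_get_addr):

#define _GNU_SOURCE
#include <stdlib.h>
#include <dlfcn.h>

/* Global-dynamic TLS: each access calls __tls_get_addr unless the
   object is compiled with -ftls-model=initial-exec.  */
static __thread unsigned long free_calls;

static void (*real_free) (void *);

static void __attribute__ ((constructor))
init (void)
{
  real_free = (void (*) (void *)) dlsym (RTLD_NEXT, "free");
}

void
free (void *ptr)
{
  /* The TLS access happens before any NULL check, so even free (NULL)
     issued during the dynamic linker's DTV update can re-enter
     __tls_get_addr -- which is what the patch above sidesteps by not
     calling free for null pointers.  */
  ++free_calls;
  if (ptr != NULL)
    real_free (ptr);
}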


@ -0,0 +1,328 @@
From 7772f9358c9a947251196ea7844b339f0a423ff6 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
Date: Tue, 16 Feb 2021 12:55:13 +0000
Subject: [PATCH] elf: Fix slow tls access after dlopen [BZ #19924]
In short: __tls_get_addr checks the global generation counter and if
the current dtv is older, then _dl_update_slotinfo updates dtv up to the
generation of the accessed module. So if the global generation is newer
than generation of the module then __tls_get_addr keeps hitting the
slow dtv update path. The dtv update path includes a number of checks
to see if any update is needed and this already causes measurable tls
access slow down after dlopen.
It may be possible to detect up-to-date dtv faster. But if there are
many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at
least walking the slotinfo list.
This patch tries to update the dtv to the global generation instead, so
after a dlopen the tls access slow path is only hit once. The modules
with larger generation than the accessed one were not necessarily
synchronized before, so additional synchronization is needed.
This patch uses acquire/release synchronization when accessing the
generation counter.
Note: in the x86_64 version of dl-tls.c the generation is only loaded
once, since relaxed mo is not faster than acquire mo load.
I have not benchmarked this. Tested by Adhemerval Zanella on aarch64,
powerpc, sparc, x86 who reported that it fixes the performance issue
of bug 19924.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit d2123d68275acc0f061e73d5f86ca504e0d5a344)
---
elf/dl-close.c | 2 +-
elf/dl-open.c | 8 +--
elf/dl-reloc.c | 6 +-
elf/dl-tls.c | 117 ++++++++++++++++++++-----------------
sysdeps/generic/ldsodefs.h | 3 +-
sysdeps/x86_64/dl-tls.c | 4 +-
6 files changed, 74 insertions(+), 66 deletions(-)
diff --git a/elf/dl-close.c b/elf/dl-close.c
index b887a44888..1c7a861db1 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -703,7 +703,7 @@ _dl_close_worker (struct link_map *map, bool force)
if (__glibc_unlikely (newgen == 0))
_dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n");
/* Can be read concurrently. */
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+ atomic_store_release (&GL(dl_tls_generation), newgen);
if (tls_free_end == GL(dl_tls_static_used))
GL(dl_tls_static_used) = tls_free_start;
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 2d985e21d8..351931af04 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -405,7 +405,7 @@ update_tls_slotinfo (struct link_map *new)
_dl_fatal_printf (N_("\
TLS generation counter wrapped! Please report this."));
/* Can be read concurrently. */
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+ atomic_store_release (&GL(dl_tls_generation), newgen);
/* We need a second pass for static tls data, because
_dl_update_slotinfo must not be run while calls to
@@ -422,8 +422,8 @@ TLS generation counter wrapped! Please report this."));
now, but we can delay updating the DTV. */
imap->l_need_tls_init = 0;
#ifdef SHARED
- /* Update the slot information data for at least the
- generation of the DSO we are allocating data for. */
+ /* Update the slot information data for the current
+ generation. */
/* FIXME: This can terminate the process on memory
allocation failure. It is not possible to raise
@@ -431,7 +431,7 @@ TLS generation counter wrapped! Please report this."));
_dl_update_slotinfo would have to be split into two
operations, similar to resize_scopes and update_scopes
above. This is related to bug 16134. */
- _dl_update_slotinfo (imap->l_tls_modid);
+ _dl_update_slotinfo (imap->l_tls_modid, newgen);
#endif
dl_init_static_tls (imap);
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index 1d558c1e0c..e5c555d82c 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -112,11 +112,11 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional)
if (map->l_real->l_relocated)
{
#ifdef SHARED
+ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock)
+ is held here so normal load of the generation counter is valid. */
if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
0))
- /* Update the slot information data for at least the generation of
- the DSO we are allocating data for. */
- (void) _dl_update_slotinfo (map->l_tls_modid);
+ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
#endif
dl_init_static_tls (map);
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 1f6f820819..70446e71a8 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map)
struct link_map *
-_dl_update_slotinfo (unsigned long int req_modid)
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
{
struct link_map *the_map = NULL;
dtv_t *dtv = THREAD_DTV ();
- /* The global dl_tls_dtv_slotinfo array contains for each module
- index the generation counter current when the entry was created.
+ /* CONCURRENCY NOTES:
+
+ The global dl_tls_dtv_slotinfo_list array contains for each module
+ index the generation counter current when that entry was updated.
This array never shrinks so that all module indices which were
- valid at some time can be used to access it. Before the first
- use of a new module index in this function the array was extended
- appropriately. Access also does not have to be guarded against
- modifications of the array. It is assumed that pointer-size
- values can be read atomically even in SMP environments. It is
- possible that other threads at the same time dynamically load
- code and therefore add to the slotinfo list. This is a problem
- since we must not pick up any information about incomplete work.
- The solution to this is to ignore all dtv slots which were
- created after the one we are currently interested. We know that
- dynamic loading for this module is completed and this is the last
- load operation we know finished. */
- unsigned long int idx = req_modid;
+ valid at some time can be used to access it. Concurrent loading
+ and unloading of modules can update slotinfo entries or extend
+ the array. The updates happen under the GL(dl_load_tls_lock) and
+ finish with the release store of the generation counter to
+ GL(dl_tls_generation) which is synchronized with the load of
+ new_gen in the caller. So updates up to new_gen are synchronized
+ but updates for later generations may not be.
+
+ Here we update the thread dtv from old_gen (== dtv[0].counter) to
+ new_gen generation. For this, each dtv[i] entry is either set to
+ an unallocated state (set), or left unmodified (nop). Where (set)
+ may resize the dtv first if modid i >= dtv[-1].counter. The rules
+ for the decision between (set) and (nop) are
+
+ (1) If slotinfo entry i is concurrently updated then either (set)
+ or (nop) is valid: TLS access cannot use dtv[i] unless it is
+ synchronized with a generation > new_gen.
+
+ Otherwise, if the generation of slotinfo entry i is gen and the
+ loaded module for this entry is map then
+
+ (2) If gen <= old_gen then do (nop).
+
+ (3) If old_gen < gen <= new_gen then
+ (3.1) if map != 0 then (set)
+ (3.2) if map == 0 then either (set) or (nop).
+
+ Note that (1) cannot be reliably detected, but since both actions
+ are valid it does not have to be. Only (2) and (3.1) cases need
+ to be distinguished for which relaxed mo access of gen and map is
+ enough: their value is synchronized when it matters.
+
+ Note that a relaxed mo load may give an out-of-thin-air value since
+ it is used in decisions that can affect concurrent stores. But this
+ should only happen if the OOTA value causes UB that justifies the
+ concurrent store of the value. This is not expected to be an issue
+ in practice. */
struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
- while (idx >= listp->len)
+ if (dtv[0].counter < new_gen)
{
- idx -= listp->len;
- listp = listp->next;
- }
-
- if (dtv[0].counter < listp->slotinfo[idx].gen)
- {
- /* CONCURRENCY NOTES:
-
- Here the dtv needs to be updated to new_gen generation count.
-
- This code may be called during TLS access when GL(dl_load_tls_lock)
- is not held. In that case the user code has to synchronize with
- dlopen and dlclose calls of relevant modules. A module m is
- relevant if the generation of m <= new_gen and dlclose of m is
- synchronized: a memory access here happens after the dlopen and
- before the dlclose of relevant modules. The dtv entries for
- relevant modules need to be updated, other entries can be
- arbitrary.
-
- This e.g. means that the first part of the slotinfo list can be
- accessed race free, but the tail may be concurrently extended.
- Similarly relevant slotinfo entries can be read race free, but
- other entries are racy. However updating a non-relevant dtv
- entry does not affect correctness. For a relevant module m,
- max_modid >= modid of m. */
- size_t new_gen = listp->slotinfo[idx].gen;
size_t total = 0;
size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
assert (max_modid >= req_modid);
@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int req_modid)
{
size_t modid = total + cnt;
- /* Later entries are not relevant. */
+ /* Case (1) for all later modids. */
if (modid > max_modid)
break;
size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);
+ /* Case (1). */
if (gen > new_gen)
- /* Not relevant. */
continue;
- /* If the entry is older than the current dtv layout we
- know we don't have to handle it. */
+ /* Case (2) or (1). */
if (gen <= dtv[0].counter)
continue;
+ /* Case (3) or (1). */
+
/* If there is no map this means the entry is empty. */
struct link_map *map
= atomic_load_relaxed (&listp->slotinfo[cnt].map);
/* Check whether the current dtv array is large enough. */
if (dtv[-1].counter < modid)
{
+ /* Case (3.2) or (1). */
if (map == NULL)
continue;
- /* Resize the dtv. */
+ /* Resizing the dtv aborts on failure: bug 16134. */
dtv = _dl_resize_dtv (dtv, max_modid);
assert (modid <= dtv[-1].counter);
@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int req_modid)
}
/* If there is currently memory allocate for this
- dtv entry free it. */
+ dtv entry free it. Note: this is not AS-safe. */
/* XXX Ideally we will at some point create a memory
pool. */
free (dtv[modid].pointer.to_free);
@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
static struct link_map *
__attribute_noinline__
-update_get_addr (GET_ADDR_ARGS)
+update_get_addr (GET_ADDR_ARGS, size_t gen)
{
- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
+ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
dtv_t *dtv = THREAD_DTV ();
void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS)
dtv_t *dtv = THREAD_DTV ();
/* Update is needed if dtv[0].counter < the generation of the accessed
- module. The global generation counter is used here as it is easier
- to check. Synchronization for the relaxed MO access is guaranteed
- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */
+ module, but the global generation counter is easier to check (which
+ must be synchronized up to the generation of the accessed module by
+ user code doing the TLS access so relaxed mo read is enough). */
size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
- return update_get_addr (GET_ADDR_PARAM);
+ {
+ /* Update DTV up to the global generation, see CONCURRENCY NOTES
+ in _dl_update_slotinfo. */
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
+ return update_get_addr (GET_ADDR_PARAM, gen);
+ }
void *p = dtv[GET_ADDR_MODULE].pointer.val;
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index e8b7359b04..ed69c6babd 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1251,7 +1251,8 @@ extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)
/* Update slot information data for at least the generation of the
module with the given index. */
-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
+extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
+ size_t gen)
attribute_hidden;
/* Look up the module's TLS block as for __tls_get_addr,
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 7a7fe38625..e9b6ab9970 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
{
dtv_t *dtv = THREAD_DTV ();
- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+ size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
- return update_get_addr (GET_ADDR_PARAM);
+ return update_get_addr (GET_ADDR_PARAM, gen);
return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
}
--
2.27.0
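A condensed sketch (hypothetical names, C11 atomics; not the glibc code) of the release/acquire pairing this patch establishes between the writers in dlopen/dlclose and the reader in the __tls_get_addr slow path:

#include <stdatomic.h>
#include <stddef.h>

/* Stand-ins for GL(dl_tls_generation) and one slotinfo entry.  */
_Atomic size_t tls_generation;
struct slotinfo { _Atomic size_t gen; void *map; };

/* Writer (dlopen/dlclose): publish the slotinfo update first, then
   bump the generation with a release store, so a reader that observes
   the new generation also observes the slotinfo contents.  */
void
publish_update (struct slotinfo *si, size_t newgen, void *map)
{
  si->map = map;
  atomic_store_explicit (&si->gen, newgen, memory_order_relaxed);
  atomic_store_explicit (&tls_generation, newgen, memory_order_release);
}

/* Reader (__tls_get_addr slow path): an acquire load of the global
   generation synchronizes with the release store above, so the DTV can
   be brought up to that generation in a single pass.  */
size_t
read_generation (void)
{
  return atomic_load_explicit (&tls_generation, memory_order_acquire);
}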


@ -0,0 +1,521 @@
From 549e7f7c5a94f5ccbab2ad5f1babca05028a31c7 Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Mon, 1 Jul 2024 17:42:04 +0200
Subject: [PATCH] elf: Support recursive use of dynamic TLS in interposed
malloc
It turns out that quite a few applications use bundled mallocs that
have been built to use global-dynamic TLS (instead of the recommended
initial-exec TLS). The previous workaround from
commit afe42e935b3ee97bac9a7064157587777259c60e ("elf: Avoid some
free (NULL) calls in _dl_update_slotinfo") does not fix all
encountered cases unfortunately.
This change avoids the TLS generation update for recursive use
of TLS from a malloc that was called during a TLS update. This
is possible because an interposed malloc has a fixed module ID and
TLS slot. (It cannot be unloaded.) If an initially-loaded module ID
is encountered in __tls_get_addr and the dynamic linker is already
in the middle of a TLS update, use the outdated DTV, thus avoiding
another call into malloc. It's still necessary to update the
DTV to the most recent generation, to get out of the slow path,
which is why the check for recursion is needed.
The bookkeeping is done using a global counter instead of a per-thread
flag because TLS access in the dynamic linker is tricky.
All this will go away once the dynamic linker stops using malloc
for TLS, likely as part of a change that pre-allocates all TLS
during pthread_create/dlopen.
Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow
tls access after dlopen [BZ #19924]").
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit 018f0fc3b818d4d1460a4e2384c24802504b1d20)
Conflict: adapt file "elf/Makefile" for patch "elf: Switch to main
malloc after final ld.so self-relocation"
---
elf/Makefile | 26 +++++++++
elf/dl-tls.c | 95 +++++++++++++++++++++++++++++---
elf/rtld.c | 2 +
elf/tst-recursive-tls.c | 60 ++++++++++++++++++++
elf/tst-recursive-tlsmallocmod.c | 64 +++++++++++++++++++++
elf/tst-recursive-tlsmodN.c | 28 ++++++++++
sysdeps/generic/ldsodefs.h | 14 +++++
sysdeps/x86_64/dl-tls.c | 5 +-
8 files changed, 284 insertions(+), 10 deletions(-)
create mode 100644 elf/tst-recursive-tls.c
create mode 100644 elf/tst-recursive-tlsmallocmod.c
create mode 100644 elf/tst-recursive-tlsmodN.c
diff --git a/elf/Makefile b/elf/Makefile
index ea98cba8..391f29e9 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -433,6 +433,7 @@ tests += \
tst-p_align1 \
tst-p_align2 \
tst-p_align3 \
+ tst-recursive-tls \
tst-relsort1 \
tst-ro-dynamic \
tst-rtld-no-malloc \
@@ -865,6 +866,23 @@ modules-names += \
tst-null-argv-lib \
tst-p_alignmod-base \
tst-p_alignmod3 \
+ tst-recursive-tlsmallocmod \
+ tst-recursive-tlsmod0 \
+ tst-recursive-tlsmod1 \
+ tst-recursive-tlsmod2 \
+ tst-recursive-tlsmod3 \
+ tst-recursive-tlsmod4 \
+ tst-recursive-tlsmod5 \
+ tst-recursive-tlsmod6 \
+ tst-recursive-tlsmod7 \
+ tst-recursive-tlsmod8 \
+ tst-recursive-tlsmod9 \
+ tst-recursive-tlsmod10 \
+ tst-recursive-tlsmod11 \
+ tst-recursive-tlsmod12 \
+ tst-recursive-tlsmod13 \
+ tst-recursive-tlsmod14 \
+ tst-recursive-tlsmod15 \
tst-relsort1mod1 \
tst-relsort1mod2 \
tst-ro-dynamic-mod \
@@ -3042,6 +3060,14 @@ CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
endif
+$(objpfx)tst-recursive-tls: $(objpfx)tst-recursive-tlsmallocmod.so
+# More objects than DTV_SURPLUS, to trigger DTV reallocation.
+$(objpfx)tst-recursive-tls.out: \
+ $(patsubst %,$(objpfx)tst-recursive-tlsmod%.so, \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+$(objpfx)tst-recursive-tlsmod%.os: tst-recursive-tlsmodN.c
+ $(compile-command.c) -DVAR=thread_$* -DFUNC=get_threadvar_$*
+
# Reuse an audit module which provides ample debug logging.
tst-rtld-no-malloc-audit-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index de016831..59d4021e 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -75,6 +75,31 @@
/* Default for dl_tls_static_optional. */
#define OPTIONAL_TLS 512
+/* Used to count the number of threads currently executing dynamic TLS
+ updates. Used to avoid recursive malloc calls in __tls_get_addr
+ for an interposed malloc that uses global-dynamic TLS (which is not
+ recommended); see _dl_tls_allocate_active checks. This could be a
+ per-thread flag, but would need TLS access in the dynamic linker. */
+unsigned int _dl_tls_threads_in_update;
+
+static inline void
+_dl_tls_allocate_begin (void)
+{
+ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1);
+}
+
+static inline void
+_dl_tls_allocate_end (void)
+{
+ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1);
+}
+
+static inline bool
+_dl_tls_allocate_active (void)
+{
+ return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0;
+}
+
/* Compute the static TLS surplus based on the namespace count and the
TLS space that can be used for optimizations. */
static inline int
@@ -425,12 +450,18 @@ _dl_allocate_tls_storage (void)
size += TLS_PRE_TCB_SIZE;
#endif
- /* Perform the allocation. Reserve space for the required alignment
- and the pointer to the original allocation. */
+ /* Reserve space for the required alignment and the pointer to the
+ original allocation. */
size_t alignment = GLRO (dl_tls_static_align);
+
+ /* Perform the allocation. */
+ _dl_tls_allocate_begin ();
void *allocated = malloc (size + alignment + sizeof (void *));
if (__glibc_unlikely (allocated == NULL))
- return NULL;
+ {
+ _dl_tls_allocate_end ();
+ return NULL;
+ }
/* Perform alignment and allocate the DTV. */
#if TLS_TCB_AT_TP
@@ -466,6 +497,8 @@ _dl_allocate_tls_storage (void)
result = allocate_dtv (result);
if (result == NULL)
free (allocated);
+
+ _dl_tls_allocate_end ();
return result;
}
@@ -483,6 +516,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
size_t newsize = max_modid + DTV_SURPLUS;
size_t oldsize = dtv[-1].counter;
+ _dl_tls_allocate_begin ();
if (dtv == GL(dl_initial_dtv))
{
/* This is the initial dtv that was either statically allocated in
@@ -502,6 +536,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
if (newp == NULL)
oom ();
}
+ _dl_tls_allocate_end ();
newp[0].counter = newsize;
@@ -676,7 +711,9 @@ allocate_dtv_entry (size_t alignment, size_t size)
if (powerof2 (alignment) && alignment <= _Alignof (max_align_t))
{
/* The alignment is supported by malloc. */
+ _dl_tls_allocate_begin ();
void *ptr = malloc (size);
+ _dl_tls_allocate_end ();
return (struct dtv_pointer) { ptr, ptr };
}
@@ -688,7 +725,10 @@ allocate_dtv_entry (size_t alignment, size_t size)
/* Perform the allocation. This is the pointer we need to free
later. */
+ _dl_tls_allocate_begin ();
void *start = malloc (alloc_size);
+ _dl_tls_allocate_end ();
+
if (start == NULL)
return (struct dtv_pointer) {};
@@ -826,7 +866,11 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
free implementation. Checking here papers over at
least some dynamic TLS usage by interposed mallocs. */
if (dtv[modid].pointer.to_free != NULL)
- free (dtv[modid].pointer.to_free);
+ {
+ _dl_tls_allocate_begin ();
+ free (dtv[modid].pointer.to_free);
+ _dl_tls_allocate_end ();
+ }
dtv[modid].pointer.val = TLS_DTV_UNALLOCATED;
dtv[modid].pointer.to_free = NULL;
@@ -956,10 +1000,22 @@ __tls_get_addr (GET_ADDR_ARGS)
size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
{
- /* Update DTV up to the global generation, see CONCURRENCY NOTES
- in _dl_update_slotinfo. */
- gen = atomic_load_acquire (&GL(dl_tls_generation));
- return update_get_addr (GET_ADDR_PARAM, gen);
+ if (_dl_tls_allocate_active ()
+ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)
+ /* This is a reentrant __tls_get_addr call, but we can
+ satisfy it because it's an initially-loaded module ID.
+ These TLS slotinfo slots do not change, so the
+ out-of-date generation counter does not matter. However,
+ if not in a TLS update, still update_get_addr below, to
+ get off the slow path eventually. */
+ ;
+ else
+ {
+ /* Update DTV up to the global generation, see CONCURRENCY NOTES
+ in _dl_update_slotinfo. */
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
+ return update_get_addr (GET_ADDR_PARAM, gen);
+ }
}
void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -969,7 +1025,7 @@ __tls_get_addr (GET_ADDR_ARGS)
return (char *) p + GET_ADDR_OFFSET;
}
-#endif
+#endif /* SHARED */
/* Look up the module's TLS block as for __tls_get_addr,
@@ -1018,6 +1074,25 @@ _dl_tls_get_addr_soft (struct link_map *l)
return data;
}
+size_t _dl_tls_initial_modid_limit;
+
+void
+_dl_tls_initial_modid_limit_setup (void)
+{
+ struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
+ size_t idx;
+ for (idx = 0; idx < listp->len; ++idx)
+ {
+ struct link_map *l = listp->slotinfo[idx].map;
+ if (l == NULL
+ /* The object can be unloaded, so its modid can be
+ reassociated. */
+ || !(l->l_type == lt_executable || l->l_type == lt_library))
+ break;
+ }
+ _dl_tls_initial_modid_limit = idx;
+}
+
void
_dl_add_to_slotinfo (struct link_map *l, bool do_add)
@@ -1050,9 +1125,11 @@ _dl_add_to_slotinfo (struct link_map *l, bool do_add)
the first slot. */
assert (idx == 0);
+ _dl_tls_allocate_begin ();
listp = (struct dtv_slotinfo_list *)
malloc (sizeof (struct dtv_slotinfo_list)
+ TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
+ _dl_tls_allocate_end ();
if (listp == NULL)
{
/* We ran out of memory while resizing the dtv slotinfo list. */
diff --git a/elf/rtld.c b/elf/rtld.c
index 558733b8..0a1e202c 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -789,6 +789,8 @@ init_tls (size_t naudit)
_dl_fatal_printf ("\
cannot allocate TLS data structures for initial thread\n");
+ _dl_tls_initial_modid_limit_setup ();
+
/* Store for detection of the special case by __tls_get_addr
so it knows not to pass this dtv to the normal realloc. */
GL(dl_initial_dtv) = GET_DTV (tcbp);
diff --git a/elf/tst-recursive-tls.c b/elf/tst-recursive-tls.c
new file mode 100644
index 00000000..716d1f78
--- /dev/null
+++ b/elf/tst-recursive-tls.c
@@ -0,0 +1,60 @@
+/* Test with interposed malloc with dynamic TLS.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+/* Defined in tst-recursive-tlsmallocmod.so. */
+extern __thread unsigned int malloc_subsytem_counter;
+
+static int
+do_test (void)
+{
+ /* 16 is large enough to exercise the DTV resizing case. */
+ void *handles[16];
+
+ for (unsigned int i = 0; i < array_length (handles); ++i)
+ {
+ /* Re-use the TLS slot for module 0. */
+ if (i > 0)
+ xdlclose (handles[0]);
+
+ char soname[30];
+ snprintf (soname, sizeof (soname), "tst-recursive-tlsmod%u.so", i);
+ handles[i] = xdlopen (soname, RTLD_NOW);
+
+ if (i > 0)
+ {
+ handles[0] = xdlopen ("tst-recursive-tlsmod0.so", RTLD_NOW);
+ int (*fptr) (void) = xdlsym (handles[0], "get_threadvar_0");
+ /* May trigger TLS storage allocation using malloc. */
+ TEST_COMPARE (fptr (), 0);
+ }
+ }
+
+ for (unsigned int i = 0; i < array_length (handles); ++i)
+ xdlclose (handles[i]);
+
+ printf ("info: malloc subsystem calls: %u\n", malloc_subsytem_counter);
+ TEST_VERIFY (malloc_subsytem_counter > 0);
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-recursive-tlsmallocmod.c b/elf/tst-recursive-tlsmallocmod.c
new file mode 100644
index 00000000..c24e9945
--- /dev/null
+++ b/elf/tst-recursive-tlsmallocmod.c
@@ -0,0 +1,64 @@
+/* Interposed malloc with dynamic TLS.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <dlfcn.h>
+
+__thread unsigned int malloc_subsytem_counter;
+
+static __typeof (malloc) *malloc_fptr;
+static __typeof (free) *free_fptr;
+static __typeof (calloc) *calloc_fptr;
+static __typeof (realloc) *realloc_fptr;
+
+static void __attribute__ ((constructor))
+init (void)
+{
+ malloc_fptr = dlsym (RTLD_NEXT, "malloc");
+ free_fptr = dlsym (RTLD_NEXT, "free");
+ calloc_fptr = dlsym (RTLD_NEXT, "calloc");
+ realloc_fptr = dlsym (RTLD_NEXT, "realloc");
+}
+
+void *
+malloc (size_t size)
+{
+ ++malloc_subsytem_counter;
+ return malloc_fptr (size);
+}
+
+void
+free (void *ptr)
+{
+ ++malloc_subsytem_counter;
+ return free_fptr (ptr);
+}
+
+void *
+calloc (size_t a, size_t b)
+{
+ ++malloc_subsytem_counter;
+ return calloc_fptr (a, b);
+}
+
+void *
+realloc (void *ptr, size_t size)
+{
+ ++malloc_subsytem_counter;
+ return realloc_fptr (ptr, size);
+}
diff --git a/elf/tst-recursive-tlsmodN.c b/elf/tst-recursive-tlsmodN.c
new file mode 100644
index 00000000..bb7592ae
--- /dev/null
+++ b/elf/tst-recursive-tlsmodN.c
@@ -0,0 +1,28 @@
+/* Test module with global-dynamic TLS. Used to trigger DTV reallocation.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Compiled with VAR and FUNC set via -D. FUNC requires some
+ relocation against TLS variable VAR. */
+
+__thread int VAR;
+
+int
+FUNC (void)
+{
+ return VAR;
+}
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 22fbbecd..ad271ae0 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1262,6 +1262,20 @@ extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
size_t gen)
attribute_hidden;
+/* The last TLS module ID that is initially loaded, plus 1. TLS
+ addresses for modules with IDs lower than that can be obtained from
+ the DTV even if its generation is outdated. */
+extern size_t _dl_tls_initial_modid_limit attribute_hidden attribute_relro;
+
+/* Compute _dl_tls_initial_modid_limit. To be called after initial
+ relocation. */
+void _dl_tls_initial_modid_limit_setup (void) attribute_hidden;
+
+/* Number of threads currently in a TLS update. This is used to
+ detect reentrant __tls_get_addr calls without a per-thread
+ flag. */
+extern unsigned int _dl_tls_threads_in_update attribute_hidden;
+
/* Look up the module's TLS block as for __tls_get_addr,
but never touch anything. Return null if it's not allocated yet. */
extern void *_dl_tls_get_addr_soft (struct link_map *l) attribute_hidden;
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index e9b6ab99..c484f39e 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
dtv_t *dtv = THREAD_DTV ();
size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
- if (__glibc_unlikely (dtv[0].counter != gen))
+ if (__glibc_unlikely (dtv[0].counter != gen)
+ /* See comment in __tls_get_addr in elf/dl-tls.c. */
+ && !(_dl_tls_allocate_active ()
+ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit))
return update_get_addr (GET_ADDR_PARAM, gen);
return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
--
2.27.0
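The message above recommends building bundled allocators with initial-exec TLS so their thread-local accesses bypass __tls_get_addr entirely; a minimal sketch of the two standard ways to get that with GCC/Clang (hypothetical variable name):

/* Either compile the whole interposer with -ftls-model=initial-exec,
   or mark individual thread-local variables explicitly:  */
__thread unsigned long malloc_calls
  __attribute__ ((tls_model ("initial-exec")));

unsigned long
bump_malloc_calls (void)
{
  /* With initial-exec TLS this is a direct thread-pointer-relative
     access, not a call into __tls_get_addr, so it cannot recurse into
     the dynamic linker's TLS machinery.  */
  return ++malloc_calls;
}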


@ -67,7 +67,7 @@
##############################################################################
Name: glibc
Version: 2.38
-Release: 51
+Release: 52
Summary: The GNU libc libraries
License: %{all_license}
URL: http://www.gnu.org/software/glibc/
@ -252,6 +252,21 @@ Patch162: nptl-initialize-rseq-area-prior-to-registration.patch
Patch163: nptl-initialize-cpu_id_start-prior-to-rseq-registrat.patch
Patch164: x86-Avoid-integer-truncation-with-large-cache-sizes-.patch
Patch165: LoongArch-Force-SHMLBA-the-same-as-kernel.patch
+Patch166: x86_64-Sort-fpu-multiarch-Makefile.patch
+Patch167: x86_64-Add-log2-with-FMA.patch
+Patch168: x86_64-Add-expm1-with-FMA.patch
+Patch169: x86_64-Add-log1p-with-FMA.patch
+Patch170: x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch
+Patch171: elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch
+Patch172: x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
+Patch173: sysdeps-x86-Makefile-Split-and-sort-tests.patch
+Patch174: x86_64-Fix-missing-wcsncat-function-definition-witho.patch
+Patch175: x86-Improve-large-memset-perf-with-non-temporal-stor.patch
+Patch176: x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
+Patch177: elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
+Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
+Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
+Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
#openEuler patch list
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
@ -1471,6 +1486,23 @@ fi
%endif
%changelog
+* Sun Jan 26 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-52
+- stdlib: Test using setenv with updated environ [BZ #32588]
+- Fix underallocation of abort_msg_s struct (CVE-2025-0395)
+- elf: Support recursive use of dynamic TLS in interposed malloc
+- elf: Avoid some free (NULL) calls in _dl_update_slotinfo
+- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212]
+- x86: Improve large memset perf with non-temporal stores [RHEL-29312]
+- x86_64: Fix missing wcsncat function definition without multiarch (x86-64-v4)
+- sysdeps/x86/Makefile: Split and sort tests
+- x86: Only align destination to 1x VEC_SIZE in memset 4x loop
+- elf: Fix slow tls access after dlopen [BZ #19924]
+- x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643]
+- x86_64: Add log1p with FMA
+- x86_64: Add expm1 with FMA
+- x86_64: Add log2 with FMA
+- x86_64: Sort fpu/multiarch/Makefile
* Wed Jan 15 2025 MayShao <mayshao-oc@zhaoxin.com> - 2.38-51
- x86: Set preferred CPU features and default NT threshold for Zhaoxin processors


@ -0,0 +1,75 @@
From 650a0aaaffa9ddb44732fa6156b31c5f30ee596f Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 24 Jan 2025 18:53:13 +0800
Subject: [PATCH] stdlib: Test using setenv with updated environ [BZ
#32588]
Add a test for setenv with updated environ. Verify that BZ #32588 is
fixed.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Florian Weimer <fweimer@redhat.com>
(cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da)
---
stdlib/Makefile | 1 +
stdlib/tst-setenv-environ.c | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+)
create mode 100644 stdlib/tst-setenv-environ.c
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 25e42a77e7..750810ee92 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -232,6 +232,7 @@ tests := \
tst-setcontext7 \
tst-setcontext8 \
tst-setcontext9 \
+ tst-setenv-environ \
tst-strfmon_l \
tst-strfrom \
tst-strfrom-locale \
diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c
new file mode 100644
index 0000000000..02fcef96d0
--- /dev/null
+++ b/stdlib/tst-setenv-environ.c
@@ -0,0 +1,36 @@
+/* Test using setenv with updated environ.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <support/check.h>
+
+extern char **environ;
+
+int
+do_test (void)
+{
+ char *valp;
+ static char *dummy_environ[] = { NULL };
+ environ = dummy_environ;
+ setenv ("A", "1", 0);
+ valp = getenv ("A");
+ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0');
+ return 0;
+}
+
+#include <support/test-driver.c>
--
2.27.0


@ -0,0 +1,178 @@
From 0d14bf0754ee8d8cf2bf3dad298fa5c5f97537db Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 7 Dec 2023 09:00:11 -0800
Subject: [PATCH] sysdeps/x86/Makefile: Split and sort tests
Put each test on a separate line and sort tests.
(cherry picked from commit 7e03e0de7e7c2de975b5c5e18f5a4b0c75816674)
---
sysdeps/x86/Makefile | 110 ++++++++++++++++++++++++++++++-------------
1 file changed, 78 insertions(+), 32 deletions(-)
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 917c26f116..5631a59a26 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -10,36 +10,51 @@ sysdep_headers += sys/platform/x86.h bits/platform/x86.h
CFLAGS-dl-get-cpu-features.os += $(rtld-early-cflags)
CFLAGS-get-cpuid-feature-leaf.o += $(no-stack-protector)
-tests += tst-get-cpu-features tst-get-cpu-features-static \
- tst-cpu-features-cpuinfo tst-cpu-features-cpuinfo-static \
- tst-cpu-features-supports tst-cpu-features-supports-static
-tests-static += tst-get-cpu-features-static \
- tst-cpu-features-cpuinfo-static \
- tst-cpu-features-supports-static
+tests += \
+ tst-get-cpu-features \
+ tst-get-cpu-features-static \
+ tst-cpu-features-cpuinfo \
+ tst-cpu-features-cpuinfo-static \
+ tst-cpu-features-supports \
+ tst-cpu-features-supports-static \
+# tests
+tests-static += \
+ tst-get-cpu-features-static \
+ tst-cpu-features-cpuinfo-static \
+ tst-cpu-features-supports-static \
+# tests-static
ifeq (yes,$(have-ifunc))
ifeq (yes,$(have-gcc-ifunc))
tests += \
tst-ifunc-isa-1 \
- tst-ifunc-isa-1-static
+ tst-ifunc-isa-1-static \
+# tests
tests-static += \
- tst-ifunc-isa-1-static
+ tst-ifunc-isa-1-static \
+# tests-static
test-xfail-tst-ifunc-isa-1 = $(with-lld)
test-xfail-tst-ifunc-isa-1-static = $(with-lld)
tests += \
tst-ifunc-isa-2 \
- tst-ifunc-isa-2-static
+ tst-ifunc-isa-2-static \
+# tests
tests-static += \
- tst-ifunc-isa-2-static
+ tst-ifunc-isa-2-static \
+# tests-static
test-xfail-tst-ifunc-isa-2 = $(with-lld)
test-xfail-tst-ifunc-isa-2-static = $(with-lld)
endif
endif
ifeq (yes,$(enable-x86-isa-level))
-tests += tst-isa-level-1
-modules-names += tst-isa-level-mod-1-baseline \
- tst-isa-level-mod-1-v2 \
- tst-isa-level-mod-1-v3 \
- tst-isa-level-mod-1-v4 \
+tests += \
+ tst-isa-level-1 \
+# tests
+modules-names += \
+ tst-isa-level-mod-1-baseline \
+ tst-isa-level-mod-1-v2 \
+ tst-isa-level-mod-1-v3 \
+ tst-isa-level-mod-1-v4 \
+# modules-names
# X86 ISA level baseline
CFLAGS-tst-isa-level-mod-1-baseline.c += -DINCLUDE_X86_ISA_LEVEL \
@@ -68,14 +83,18 @@ tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
endif
ifeq ($(subdir),math)
-tests += tst-ldbl-nonnormal-printf
+tests += \
+ tst-ldbl-nonnormal-printf \
+# tests
endif # $(subdir) == math
ifeq ($(subdir),setjmp)
gen-as-const-headers += jmp_buf-ssp.sym
sysdep_routines += __longjmp_cancel
ifneq ($(enable-cet),no)
-tests += tst-setjmp-cet
+tests += \
+ tst-setjmp-cet \
+# tests
tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on
endif
endif
@@ -122,20 +141,45 @@ ifneq ($(enable-cet),no)
ifeq ($(subdir),elf)
sysdep-dl-routines += dl-cet
-tests += tst-cet-legacy-1 tst-cet-legacy-1a tst-cet-legacy-2 \
- tst-cet-legacy-2a tst-cet-legacy-3 tst-cet-legacy-4 \
- tst-cet-legacy-5a tst-cet-legacy-6a tst-cet-legacy-7 \
- tst-cet-legacy-8 tst-cet-legacy-9 tst-cet-legacy-9-static \
- tst-cet-legacy-10 tst-cet-legacy-10-static
-tests-static += tst-cet-legacy-9-static tst-cet-legacy-10-static
+tests += \
+ tst-cet-legacy-1 \
+ tst-cet-legacy-1a \
+ tst-cet-legacy-2 \
+ tst-cet-legacy-2a \
+ tst-cet-legacy-3 \
+ tst-cet-legacy-4 \
+ tst-cet-legacy-5a \
+ tst-cet-legacy-6a \
+ tst-cet-legacy-7 \
+ tst-cet-legacy-8 \
+ tst-cet-legacy-9 \
+ tst-cet-legacy-9-static \
+ tst-cet-legacy-10 \
+ tst-cet-legacy-10-static \
+# tests
+tests-static += \
+ tst-cet-legacy-9-static \
+ tst-cet-legacy-10-static \
+# tests-static
tst-cet-legacy-1a-ARGS = -- $(host-test-program-cmd)
-tests += tst-cet-legacy-4a tst-cet-legacy-4b tst-cet-legacy-4c \
- tst-cet-legacy-5b tst-cet-legacy-6b
-modules-names += tst-cet-legacy-mod-1 tst-cet-legacy-mod-2 \
- tst-cet-legacy-mod-4 tst-cet-legacy-mod-5a \
- tst-cet-legacy-mod-5b tst-cet-legacy-mod-5c \
- tst-cet-legacy-mod-6a tst-cet-legacy-mod-6b \
- tst-cet-legacy-mod-6c
+tests += \
+ tst-cet-legacy-4a \
+ tst-cet-legacy-4b \
+ tst-cet-legacy-4c \
+ tst-cet-legacy-5b \
+ tst-cet-legacy-6b \
+# tests
+modules-names += \
+ tst-cet-legacy-mod-1 \
+ tst-cet-legacy-mod-2 \
+ tst-cet-legacy-mod-4 \
+ tst-cet-legacy-mod-5a \
+ tst-cet-legacy-mod-5b \
+ tst-cet-legacy-mod-5c \
+ tst-cet-legacy-mod-6a \
+ tst-cet-legacy-mod-6b \
+ tst-cet-legacy-mod-6c \
+# modules-names
CFLAGS-tst-cet-legacy-2.c += -fcf-protection=branch
CFLAGS-tst-cet-legacy-2a.c += -fcf-protection
@@ -243,7 +287,9 @@ endif
ifeq ($(subdir),posix)
tests += \
tst-sysconf-cache-linesize \
- tst-sysconf-cache-linesize-static
+ tst-sysconf-cache-linesize-static \
+# tests
tests-static += \
- tst-sysconf-cache-linesize-static
+ tst-sysconf-cache-linesize-static \
+# tests-static
endif
--
2.27.0


@ -0,0 +1,77 @@
From 58822f954f6284c8687dfff43fa4e9e349eeccad Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 28 Aug 2023 12:08:14 -0700
Subject: [PATCH] x86: Check the lower byte of EAX of CPUID leaf 2 [BZ
#30643]
The old Intel software developer manual specified that the low byte of
EAX of CPUID leaf 2 returned 1 which indicated the number of rounds of
CPUID leaf 2 was needed to retrieve the complete cache information. The
newer Intel manual has been changed so that it should always return 1
and be ignored. If the lower byte isn't 1, CPUID leaf 2 can't be used.
In this case, we ignore CPUID leaf 2 and use CPUID leaf 4 instead. If
CPUID leaf 4 doesn't contain the cache information, cache information
isn't available at all. This addresses BZ #30643.
(cherry picked from commit 1493622f4f9048ffede3fbedb64695efa49d662a)
---
sysdeps/x86/dl-cacheinfo.h | 31 +++++++++++++------------------
1 file changed, 13 insertions(+), 18 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 6c7740422a..400d15f208 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -187,7 +187,7 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
++round;
}
/* There is no other cache information anywhere else. */
- break;
+ return -1;
}
else
{
@@ -257,28 +257,23 @@ handle_intel (int name, const struct cpu_features *cpu_features)
/* OK, we can use the CPUID instruction to get all info about the
caches. */
- unsigned int cnt = 0;
- unsigned int max = 1;
long int result = 0;
bool no_level_2_or_3 = false;
bool has_level_2 = false;
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ __cpuid (2, eax, ebx, ecx, edx);
- while (cnt++ < max)
+ /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
+ should be ignored. If it isn't 1, use CPUID leaf 4 instead. */
+ if ((eax & 0xff) != 1)
+ return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
+ cpu_features);
+ else
{
- unsigned int eax;
- unsigned int ebx;
- unsigned int ecx;
- unsigned int edx;
- __cpuid (2, eax, ebx, ecx, edx);
-
- /* The low byte of EAX in the first round contain the number of
- rounds we have to make. At least one, the one we are already
- doing. */
- if (cnt == 1)
- {
- max = eax & 0xff;
- eax &= 0xffffff00;
- }
+ eax &= 0xffffff00;
/* Process the individual registers' value. */
result = intel_check_word (name, eax, &has_level_2,
--
2.27.0
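A small, hypothetical user-space probe of the behaviour the patch above relies on, using GCC's <cpuid.h> (x86 only): it reads leaf 2 and reports whether the low byte of EAX is the always-1 value documented by current manuals.

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  if (!__get_cpuid (2, &eax, &ebx, &ecx, &edx))
    {
      puts ("CPUID leaf 2 not supported");
      return 0;
    }

  /* If the low byte is not 1, the leaf 2 descriptors cannot be
     trusted and cache information must come from leaf 4 instead,
     which is exactly the fallback the patch takes.  */
  printf ("CPUID.2:EAX low byte = %u\n", eax & 0xff);
  return 0;
}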


@ -0,0 +1,254 @@
From 04b8d484323b2ff18b3422c4b883ef4cb6281c53 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 24 May 2024 12:38:50 -0500
Subject: [PATCH] x86: Improve large memset perf with non-temporal stores
[RHEL-29312]
Previously we used `rep stosb` for all medium/large memsets. This is
notably worse than non-temporal stores for large (above a
few MBs) memsets.
See:
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
For data using different strategies for large memset on ICX and SKX.
Using non-temporal stores can be up to 3x faster on ICX and 2x faster
on SKX. Historically, these numbers would not have been so good
because of the zero-over-zero writeback optimization that `rep stosb`
is able to do. But, the zero-over-zero writeback optimization has been
removed as a potential side-channel attack, so there is no longer any
good reason to only rely on `rep stosb` for large memsets. On the flip
size, non-temporal writes can avoid data in their RFO requests saving
memory bandwidth.
All of the other changes to the file are to re-organize the
code-blocks to maintain "good" alignment given the new code added in
the `L(stosb_local)` case.
The results from running the GLIBC memset benchmarks on TGL-client for
N=20 runs:
Geometric Mean across the suite New / Old EXEX256: 0.979
Geometric Mean across the suite New / Old EXEX512: 0.979
Geometric Mean across the suite New / Old AVX2 : 0.986
Geometric Mean across the suite New / Old SSE2 : 0.979
Most of the cases are essentially unchanged, this is mostly to show
that adding the non-temporal case didn't add any regressions to the
other cases.
The results on the memset-large benchmark suite on TGL-client for N=20
runs:
Geometric Mean across the suite New / Old EXEX256: 0.926
Geometric Mean across the suite New / Old EXEX512: 0.925
Geometric Mean across the suite New / Old AVX2 : 0.928
Geometric Mean across the suite New / Old SSE2 : 0.924
So roughly a 7.5% speedup. This is lower than what we see on servers
(likely because clients typically have faster single-core bandwidth so
saving bandwidth on RFOs is less impactful), but still advantageous.
Full test-suite passes on x86_64 w/ and w/o multiarch.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 5bf0ab80573d66e4ae5d94b094659094336da90f)
---
.../multiarch/memset-vec-unaligned-erms.S | 147 +++++++++++-------
1 file changed, 91 insertions(+), 56 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 0f0636b90f..aba45e3da0 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -21,8 +21,13 @@
2. If size is less than VEC, use integer register stores.
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
- 4 VEC stores and store 4 * VEC at a time until done. */
+ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done.
+ 6. On machines ERMS feature, if size is range
+ [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ then REP STOSB will be used.
+ 7. If size >= __x86_shared_non_temporal_threshold, use a
+ non-temporal stores. */
#include <sysdep.h>
@@ -145,6 +150,41 @@ L(entry_from_wmemset):
VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VMM(0), (%rdi)
VZEROUPPER_RETURN
+
+ /* If have AVX512 mask instructions put L(less_vec) close to
+ entry as it doesn't take much space and is likely a hot target. */
+#ifdef USE_LESS_VEC_MASK_STORE
+ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */
+ .p2align 6,, 47
+ .p2align 4
+L(less_vec):
+L(less_vec_from_wmemset):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. Note that we are using rax which is set in
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
+ andl $(PAGE_SIZE - 1), %edi
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
+ serious performance degradation when it has to fault suppress. */
+ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+ /* This is generally considered a cold target. */
+ ja L(cross_page)
+# if VEC_SIZE > 32
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+ kmovq %rcx, %k1
+# else
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k1
+# endif
+ vmovdqu8 %VMM(0), (%rax){%k1}
+ VZEROUPPER_RETURN
+#endif
+
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))
@@ -183,54 +223,6 @@ L(last_2x_vec):
#endif
VZEROUPPER_RETURN
- /* If have AVX512 mask instructions put L(less_vec) close to
- entry as it doesn't take much space and is likely a hot target.
- */
-#ifdef USE_LESS_VEC_MASK_STORE
- .p2align 4,, 10
-L(less_vec):
-L(less_vec_from_wmemset):
- /* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-# endif
- /* Clear high bits from edi. Only keeping bits relevant to page
- cross check. Note that we are using rax which is set in
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
- andl $(PAGE_SIZE - 1), %edi
- /* Check if VEC_SIZE store cross page. Mask stores suffer
- serious performance degradation when it has to fault suppress.
- */
- cmpl $(PAGE_SIZE - VEC_SIZE), %edi
- /* This is generally considered a cold target. */
- ja L(cross_page)
-# if VEC_SIZE > 32
- movq $-1, %rcx
- bzhiq %rdx, %rcx, %rcx
- kmovq %rcx, %k1
-# else
- movl $-1, %ecx
- bzhil %edx, %ecx, %ecx
- kmovd %ecx, %k1
-# endif
- vmovdqu8 %VMM(0), (%rax){%k1}
- VZEROUPPER_RETURN
-
-# if defined USE_MULTIARCH && IS_IN (libc)
- /* Include L(stosb_local) here if including L(less_vec) between
- L(stosb_more_2x_vec) and ENTRY. This is to cache align the
- L(stosb_more_2x_vec) target. */
- .p2align 4,, 10
-L(stosb_local):
- movzbl %sil, %eax
- mov %RDX_LP, %RCX_LP
- mov %RDI_LP, %RDX_LP
- rep stosb
- mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
-# endif
-#endif
-
#if defined USE_MULTIARCH && IS_IN (libc)
.p2align 4
L(stosb_more_2x_vec):
@@ -316,21 +308,33 @@ L(return_vzeroupper):
ret
#endif
- .p2align 4,, 10
-#ifndef USE_LESS_VEC_MASK_STORE
-# if defined USE_MULTIARCH && IS_IN (libc)
+#ifdef USE_WITH_AVX2
+ .p2align 4
+#else
+ .p2align 4,, 4
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP
mov %RDI_LP, %RDX_LP
rep stosb
+# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
+ /* Use xchg to save 1-byte (this helps align targets below). */
+ xchg %RDX_LP, %RAX_LP
+# else
mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
# endif
+ VZEROUPPER_RETURN
+#endif
+#ifndef USE_LESS_VEC_MASK_STORE
/* Define L(less_vec) only if not otherwise defined. */
- .p2align 4
+ .p2align 4,, 12
L(less_vec):
/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
xmm). This is only does anything for AVX2. */
@@ -421,4 +425,35 @@ L(between_2_3):
movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
#endif
ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# ifdef USE_WITH_AVX512
+ /* Force align so the loop doesn't cross a cache-line. */
+ .p2align 4
+# endif
+ .p2align 4,, 7
+ /* Memset using non-temporal stores. */
+L(nt_memset):
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi)
+ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx
+ /* Align DST. */
+ orq $(VEC_SIZE * 1 - 1), %rdi
+ incq %rdi
+ .p2align 4,, 7
+L(nt_loop):
+ VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi)
+ subq $(VEC_SIZE * -4), %rdi
+ cmpq %rdx, %rdi
+ jb L(nt_loop)
+ sfence
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx)
+ VZEROUPPER_RETURN
+#endif
+
+END(MEMSET_SYMBOL(__memset, unaligned_erms))
--
2.27.0
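In C-like terms, the size-based strategy selection the assembly above implements looks roughly like the following sketch (the thresholds correspond to the glibc tunables named in the patch; the helper functions are placeholders, not real glibc entry points):

#include <stddef.h>

/* Placeholder thresholds and helpers; in glibc these are the
   x86_rep_stosb_threshold / x86_shared_non_temporal_threshold values
   and hand-written assembly paths.  */
extern size_t rep_stosb_threshold, shared_non_temporal_threshold;
extern void *vector_store_loop (void *, int, size_t);
extern void *rep_stosb (void *, int, size_t);
extern void *non_temporal_store_loop (void *, int, size_t);

void *
memset_dispatch (void *dst, int c, size_t n)
{
  if (n < rep_stosb_threshold)
    return vector_store_loop (dst, c, n);   /* small/medium sizes */
  if (n < shared_non_temporal_threshold)
    return rep_stosb (dst, c, n);           /* ERMS range */
  /* Very large sizes: non-temporal stores skip the RFO traffic and,
     per the measurements above, can be 2-3x faster than rep stosb.  */
  return non_temporal_store_loop (dst, c, n);
}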


@ -0,0 +1,34 @@
From 5a64f933655384477d85122c6855dc6d84061810 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 1 Nov 2023 15:30:26 -0500
Subject: [PATCH] x86: Only align destination to 1x VEC_SIZE in memset 4x
loop
Current code aligns to 2x VEC_SIZE. Aligning to 2x has no effect on
performance other than potentially resulting in an additional
iteration of the loop.
1x maintains aligned stores (the only reason to align in this case)
and doesn't incur any unnecessary loop iterations.
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit 9469261cf1924d350feeec64d2c80cafbbdcdd4d)
---
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 3d9ad49cb9..0f0636b90f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -293,7 +293,7 @@ L(more_2x_vec):
leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
/* Align dst for loop. */
- andq $(VEC_SIZE * -2), %LOOP_REG
+ andq $(VEC_SIZE * -1), %LOOP_REG
.p2align 4
L(loop):
VMOVA %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
--
2.27.0
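
A tiny, runnable illustration of the difference this one-line change makes (the address and VEC_SIZE below are made-up values; the real code applies the mask to %LOOP_REG):

#include <stdint.h>
#include <stdio.h>

#define VEC_SIZE 32u   /* illustrative; matches the AVX2 build */

int
main (void)
{
  uintptr_t p = 0x1060;   /* some mid-buffer position, already 32-byte aligned */
  uintptr_t a1 = p & ~(uintptr_t) (1 * VEC_SIZE - 1);   /* andq $(VEC_SIZE * -1) */
  uintptr_t a2 = p & ~(uintptr_t) (2 * VEC_SIZE - 1);   /* the old 2x mask */
  printf ("1x: %#lx  2x: %#lx\n", (unsigned long) a1, (unsigned long) a2);
  /* 2x can land a full VEC_SIZE lower, which is the potential extra loop
     iteration the commit message refers to.  */
  return 0;
}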

View File

@ -0,0 +1,149 @@
From 12fec8aae5e17cc4dc3bb079265c46ee78faeddb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 27 Sep 2024 15:50:10 -0700
Subject: [PATCH] x86/string: Fixup alignment of main loop in
str{n}cmp-evex [BZ #32212]
The loop should be aligned to 32 bytes so that it can ideally run out
of the DSB. This is particularly important on Skylake-Server, where
deficiencies in its DSB implementation make it prone to not being
able to run loops out of the DSB.
For example, running strcmp-evex on a 200MB string:
32-byte aligned loop:
- 43,399,578,766 idq.dsb_uops
not 32-byte aligned loop:
- 6,060,139,704 idq.dsb_uops
This results in a 25% performance degradation for the non-aligned
version.
The fix is simply to ensure that the code layout keeps the loop
aligned (this was previously the case but was accidentally dropped
in 84e7c46df).
NB: The fix actually uses 64-byte alignment. This is because 64-byte
alignment generally produces more stable performance than 32-byte
aligned code (cache-line crosses can affect perf), so if we are going
past 16-byte alignment, we might as well go to 64. 64-byte alignment
also matches most other functions we over-align, so it creates a
common point of optimization.
Times are reported as the ratio Time_With_Patch /
Time_Without_Patch. Lower is better.
The values reported are the geometric mean of the ratio across
all tests in bench-strcmp and bench-strncmp.
Note this patch is only attempting to improve the Skylake-Server
strcmp for long strings. The rest of the numbers are only to test for
regressions.
Tigerlake Results Strings <= 512:
strcmp : 1.026
strncmp: 0.949
Tigerlake Results Strings > 512:
strcmp : 0.994
strncmp: 0.998
Skylake-Server Results Strings <= 512:
strcmp : 0.945
strncmp: 0.943
Skylake-Server Results Strings > 512:
strcmp : 0.778
strncmp: 1.000
The 2.6% regression on TGL-strcmp is due to slowdowns caused by
changes in the alignment of code handling small sizes (mostly in the
page-cross logic). These should be safe to ignore because 1) we
previously only 16-byte aligned the function, so this behavior is not
new and was essentially up to chance before this patch, and 2) this
type of alignment-related regression on small sizes really only comes
up in tight micro-benchmark loops and is unlikely to have any effect
on real-world performance.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 483443d3211532903d7e790211af5a1d55fdb1f3)
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index ae39cdf217..6a7fec669e 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -209,7 +209,9 @@
returned. */
.section SECTION(.text), "ax", @progbits
- .align 16
+ /* Align 64 bytes here. This is to get the L(loop) block ideally
+ aligned for the DSB. */
+ .align 64
.type STRCMP, @function
.globl STRCMP
# ifdef USE_AS_STRCASECMP_L
@@ -509,9 +511,7 @@ L(ret4):
ret
# endif
- /* 32 byte align here ensures the main loop is ideally aligned
- for DSB. */
- .p2align 5
+ .p2align 4,, 4
L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU (VEC_SIZE)(%rdi), %VMM(0)
@@ -1426,10 +1426,9 @@ L(less_32_till_page):
L(ret_zero_page_cross_slow_case0):
xorl %eax, %eax
ret
-# endif
-
-
+# else
.p2align 4,, 10
+# endif
L(less_16_till_page):
cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
ja L(less_8_till_page)
@@ -1482,8 +1481,12 @@ L(less_16_till_page):
# endif
jmp L(prepare_loop_aligned)
-
-
+# ifndef USE_AS_STRNCMP
+ /* Fits in aligning bytes. */
+L(ret_zero_4_loop):
+ xorl %eax, %eax
+ ret
+# endif
.p2align 4,, 10
L(less_8_till_page):
@@ -1554,6 +1557,7 @@ L(ret_less_8_wcs):
# ifdef USE_AS_STRNCMP
.p2align 4,, 2
+L(ret_zero_4_loop):
L(ret_zero_page_cross_slow_case1):
xorl %eax, %eax
ret
@@ -1586,10 +1590,6 @@ L(less_4_loop):
subq $-(CHAR_PER_VEC * 4), %rdx
# endif
jmp L(prepare_loop_aligned)
-
-L(ret_zero_4_loop):
- xorl %eax, %eax
- ret
L(ret_less_4_loop):
xorl %r8d, %eax
subl %r8d, %eax
--
2.27.0
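
For reference, the reported values are the geometric mean of the per-test time ratios; a minimal sketch of that computation (the ratios below are invented, purely to exercise the helper):

#include <math.h>
#include <stdio.h>

/* Geometric mean of per-test time ratios, as used for the numbers above.  */
static double
geomean (const double *r, int n)
{
  double log_sum = 0.0;
  for (int i = 0; i < n; i++)
    log_sum += log (r[i]);
  return exp (log_sum / n);
}

int
main (void)
{
  double ratios[] = { 0.95, 1.02, 0.88, 1.00 };
  printf ("geometric mean: %f\n", geomean (ratios, 4));
  return 0;
}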

View File

@ -0,0 +1,135 @@
From b2a45f1eee39d67c1fff2d697d32857fb13c8c5d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 11 Aug 2023 08:04:08 -0700
Subject: [PATCH] x86_64: Add expm1 with FMA
On Skylake, it improves expm1 bench performance by:
Before After Improvement
max 70.204 68.054 3%
min 20.709 16.2 22%
mean 22.1221 16.7367 24%
NB: Add
extern long double __expm1l (long double);
extern long double __expm1f128 (long double);
for __typeof (__expm1l) and __typeof (__expm1f128) when __expm1 is
defined, since __expm1 may be expanded in their declarations, which
would otherwise cause a build failure.
(cherry picked from commit 1b214630ce6f7e0099b8b6f87246246739b079cf)
---
sysdeps/ieee754/dbl-64/s_expm1.c | 7 +++++
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c | 10 ++++++
sysdeps/x86_64/fpu/multiarch/s_expm1.c | 36 ++++++++++++++++++++++
4 files changed, 55 insertions(+)
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1.c
diff --git a/sysdeps/ieee754/dbl-64/s_expm1.c b/sysdeps/ieee754/dbl-64/s_expm1.c
index 8f1c95bd04..1cafeca9c0 100644
--- a/sysdeps/ieee754/dbl-64/s_expm1.c
+++ b/sysdeps/ieee754/dbl-64/s_expm1.c
@@ -130,6 +130,11 @@ static const double
4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */
-2.01099218183624371326e-07 }; /* BE8AFDB7 6E09C32D */
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__expm1 (double x)
{
@@ -258,4 +263,6 @@ __expm1 (double x)
}
return y;
}
+#ifndef __expm1
libm_alias_double (__expm1, expm1)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index f773255721..add339a876 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -37,6 +37,7 @@ libm-sysdep_routines += \
e_log2-fma \
e_pow-fma \
s_atan-fma \
+ s_expm1-fma \
s_sin-fma \
s_sincos-fma \
s_tan-fma \
@@ -49,6 +50,7 @@ CFLAGS-e_log-fma.c = -mfma -mavx2
CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
+CFLAGS-s_expm1-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
new file mode 100644
index 0000000000..3ee2bd804e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
@@ -0,0 +1,10 @@
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_expm1.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1.c b/sysdeps/x86_64/fpu/multiarch/s_expm1.c
new file mode 100644
index 0000000000..2cae83fb7f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_expm1.c
@@ -0,0 +1,36 @@
+/* Multiple versions of expm1.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+
+extern double __redirect_expm1 (double);
+
+#define SYMBOL_NAME expm1
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_expm1, __expm1, IFUNC_SELECTOR ());
+libm_alias_double (__expm1, expm1)
+
+#define __expm1 __expm1_sse2
+
+/* NB: __expm1 may be expanded to __expm1_sse2 in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#include <sysdeps/ieee754/dbl-64/s_expm1.c>
--
2.27.0
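
glibc's ifunc-fma.h / libc_ifunc_redirected machinery boils down to a load-time choice between the FMA build and the SSE2 build of the same C source. A self-contained sketch of that idea using GCC's plain ifunc attribute on an x86-64 ELF target (foo and resolve_foo are illustrative names, not glibc internals):

#include <stdio.h>

static double foo_generic (double x) { return x * 2.0; }
static double foo_fma (double x)     { return x * 2.0; }  /* stand-in for the -mfma build */

/* The resolver runs once, at relocation time, and picks the implementation.  */
static double (*resolve_foo (void)) (double)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("fma") ? foo_fma : foo_generic;
}

double foo (double) __attribute__ ((ifunc ("resolve_foo")));

int
main (void)
{
  printf ("foo (21.0) = %g\n", foo (21.0));
  return 0;
}

In the actual patch the FMA variant is additionally compiled with -mfma -mavx2 and placed in the .text.fma section via the SECTION macro shown in the diff.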

View File

@ -0,0 +1,140 @@
From c92946d9b29956be78ca4487264848714fd5d505 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 17 Aug 2023 09:42:29 -0700
Subject: [PATCH] x86_64: Add log1p with FMA
On Skylake, it changes log1p bench performance by:
Before After Improvement
max 63.349 58.347 8%
min 4.448 5.651 -30%
mean 12.0674 10.336 14%
The minimum code path is
if (hx < 0x3FDA827A) /* x < 0.41422 */
{
if (__glibc_unlikely (ax >= 0x3ff00000)) /* x <= -1.0 */
{
...
}
if (__glibc_unlikely (ax < 0x3e200000)) /* |x| < 2**-29 */
{
math_force_eval (two54 + x); /* raise inexact */
if (ax < 0x3c900000) /* |x| < 2**-54 */
{
...
}
else
return x - x * x * 0.5;
The FMA and non-FMA code sequences look similar, and the non-FMA
version is slightly faster. Since log1p is called by asinh and atanh,
this change improves asinh performance by:
Before After Improvement
max 75.645 63.135 16%
min 10.074 10.071 0%
mean 15.9483 14.9089 6%
and improves atanh performance by:
Before After Improvement
max 91.768 75.081 18%
min 15.548 13.883 10%
mean 18.3713 16.8011 8%
(cherry picked from commit a8ecb126d4c26c52f4ad828c566afe4043a28155)
---
sysdeps/ieee754/dbl-64/s_log1p.c | 5 ++++
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c | 4 +++
sysdeps/x86_64/fpu/multiarch/s_log1p.c | 29 ++++++++++++++++++++++
4 files changed, 40 insertions(+)
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p.c
diff --git a/sysdeps/ieee754/dbl-64/s_log1p.c b/sysdeps/ieee754/dbl-64/s_log1p.c
index e6476a8260..eeb0af859f 100644
--- a/sysdeps/ieee754/dbl-64/s_log1p.c
+++ b/sysdeps/ieee754/dbl-64/s_log1p.c
@@ -99,6 +99,11 @@ static const double
static const double zero = 0.0;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__log1p (double x)
{
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index add339a876..ea81753b70 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -38,6 +38,7 @@ libm-sysdep_routines += \
e_pow-fma \
s_atan-fma \
s_expm1-fma \
+ s_log1p-fma \
s_sin-fma \
s_sincos-fma \
s_tan-fma \
@@ -51,6 +52,7 @@ CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_expm1-fma.c = -mfma -mavx2
+CFLAGS-s_log1p-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
new file mode 100644
index 0000000000..8952df8f9e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
@@ -0,0 +1,4 @@
+#define __log1p __log1p_fma
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_log1p.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p.c b/sysdeps/x86_64/fpu/multiarch/s_log1p.c
new file mode 100644
index 0000000000..6ce5198d6d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_log1p.c
@@ -0,0 +1,29 @@
+/* Multiple versions of log1p.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+
+extern double __redirect_log1p (double);
+
+#define SYMBOL_NAME log1p
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_log1p, __log1p, IFUNC_SELECTOR ());
+
+#define __log1p __log1p_sse2
+#include <sysdeps/ieee754/dbl-64/s_log1p.c>
--
2.27.0
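
The |x| < 2**-29 branch quoted above returns x - x*x*0.5, the second-order expansion of log1p around zero. A small standalone check of how close that shortcut is (illustrative only, not glibc test code; link with -lm):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 0x1p-30;                 /* well inside the |x| < 2**-29 branch */
  double shortcut = x - x * x * 0.5;  /* the return quoted above */
  printf ("log1p(x)  = %a\n", log1p (x));
  printf ("shortcut  = %a\n", shortcut);
  printf ("abs diff  = %g\n", fabs (log1p (x) - shortcut));
  return 0;
}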

View File

@ -0,0 +1,102 @@
From 49016f2190693d5b2d4d6294d438ebae7a58d151 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 10 Aug 2023 11:24:30 -0700
Subject: [PATCH] x86_64: Add log2 with FMA
On Skylake, it improves log2 bench performance by:
Before After Improvement
max 208.779 63.827 69%
min 9.977 6.55 34%
mean 10.366 6.8191 34%
(cherry picked from commit f6b10ed8e9a00de49d0951e760cc2b5288862b47)
---
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
sysdeps/x86_64/fpu/multiarch/e_log2-fma.c | 3 ++
sysdeps/x86_64/fpu/multiarch/e_log2.c | 43 +++++++++++++++++++++++
3 files changed, 48 insertions(+)
create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2.c
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index e37e488c37..f773255721 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -34,6 +34,7 @@ libm-sysdep_routines += \
e_atan2-fma \
e_exp-fma \
e_log-fma \
+ e_log2-fma \
e_pow-fma \
s_atan-fma \
s_sin-fma \
@@ -45,6 +46,7 @@ CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
CFLAGS-e_exp-fma.c = -mfma -mavx2
CFLAGS-e_log-fma.c = -mfma -mavx2
+CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
new file mode 100644
index 0000000000..9fbebc1b47
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
@@ -0,0 +1,3 @@
+#define __log2 __log2_fma
+
+#include <sysdeps/ieee754/dbl-64/e_log2.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2.c b/sysdeps/x86_64/fpu/multiarch/e_log2.c
new file mode 100644
index 0000000000..c0320caf36
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_log2.c
@@ -0,0 +1,43 @@
+/* Multiple versions of log2.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+#include <libm-alias-finite.h>
+
+extern double __redirect_log2 (double);
+
+#define SYMBOL_NAME log2
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_log2, __log2, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (__log2, __GI___log2, __redirect_log2)
+ __attribute__ ((visibility ("hidden")));
+
+versioned_symbol (libm, __ieee754_log2, log2, GLIBC_2_29);
+libm_alias_double_other (__log2, log2)
+#else
+libm_alias_double (__log2, log2)
+#endif
+
+strong_alias (__log2, __ieee754_log2)
+libm_alias_finite (__log2, __log2)
+
+#define __log2 __log2_sse2
+#include <sysdeps/ieee754/dbl-64/e_log2.c>
--
2.27.0

View File

@ -0,0 +1,44 @@
From dc1762113dbe40be832bedd41b52d9822d62c50f Mon Sep 17 00:00:00 2001
From: Gabi Falk <gabifalk@gmx.com>
Date: Tue, 7 May 2024 18:25:00 +0000
Subject: [PATCH] x86_64: Fix missing wcsncat function definition without
multiarch (x86-64-v4)
This code expects the WCSCAT preprocessor macro to be predefined when
the evex implementation of the function should be defined with a name
different from __wcsncat_evex. However, when glibc is built for
x86-64-v4 without multiarch support, sysdeps/x86_64/wcsncat.S defines
the WCSNCAT variable instead of WCSCAT to build it as wcsncat. Rename
the variable to WCSNCAT, as it is actually a better naming choice for
the variable in this case.
Reported-by: Kenton Groombridge
Link: https://bugs.gentoo.org/921945
Fixes: 64b8b6516b ("x86: Add evex optimized functions for the wchar_t strcpy family")
Signed-off-by: Gabi Falk <gabifalk@gmx.com>
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit dd5f891c1ad9f1b43b9db93afe2a55cbb7a6194e)
---
sysdeps/x86_64/multiarch/wcsncat-evex.S | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
index 392215950a..10bfb0a531 100644
--- a/sysdeps/x86_64/multiarch/wcsncat-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
@@ -1,9 +1,9 @@
-#ifndef WCSCAT
-# define WCSCAT __wcsncat_evex
+#ifndef WCSNCAT
+# define WCSNCAT __wcsncat_evex
#endif
#define USE_AS_WCSCPY
#define USE_AS_STRCAT
-#define STRNCAT WCSCAT
+#define STRNCAT WCSNCAT
#include "strncat-evex.S"
--
2.27.0
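
The fix relies on the usual override pattern: the included file only supplies a default symbol name when the builder has not already chosen one. A small, illustrative preprocessor demonstration (my_wcsncat is a made-up name standing in for the non-multiarch build's choice):

#include <stdio.h>

#define WCSNCAT my_wcsncat          /* what the including .S file would set */

#ifndef WCSNCAT                     /* default only when nothing was chosen */
# define WCSNCAT __wcsncat_evex
#endif

#define STR(x) #x
#define XSTR(x) STR (x)

int
main (void)
{
  printf ("function will be built as: %s\n", XSTR (WCSNCAT));
  return 0;
}

Before the fix, the default was guarded by WCSCAT instead, so the name chosen by the non-multiarch build was silently ignored and no wcsncat symbol was emitted.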

View File

@ -0,0 +1,144 @@
From 5c9be512ee25ceab92a284adc75fe22bbd94b179 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 9 Aug 2023 11:08:52 -0700
Subject: [PATCH] x86_64: Sort fpu/multiarch/Makefile
Sort Makefile variables using scripts/sort-makefile-lines.py.
No code generation changes observed in libm. No regressions on x86_64.
(cherry picked from commit 881546979d0219c18337e1b4f4d00cfacab13c40)
---
sysdeps/x86_64/fpu/multiarch/Makefile | 94 +++++++++++++++++++++------
1 file changed, 74 insertions(+), 20 deletions(-)
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 248162525b..e37e488c37 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,17 +1,45 @@
ifeq ($(subdir),math)
-libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
- s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
- s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
+libm-sysdep_routines += \
+ s_ceil-c \
+ s_ceilf-c \
+ s_floor-c \
+ s_floorf-c \
+ s_rint-c \
+ s_rintf-c \
+ s_nearbyint-c \
+ s_nearbyintf-c \
+ s_roundeven-c \
+ s_roundevenf-c \
+ s_trunc-c \
+ s_truncf-c \
+# libm-sysdep_routines
-libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
- s_floorf-sse4_1 s_nearbyint-sse4_1 \
- s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
- s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
- s_trunc-sse4_1 s_truncf-sse4_1
+libm-sysdep_routines += \
+ s_ceil-sse4_1 \
+ s_ceilf-sse4_1 \
+ s_floor-sse4_1 \
+ s_floorf-sse4_1 \
+ s_nearbyint-sse4_1 \
+ s_nearbyintf-sse4_1 \
+ s_roundeven-sse4_1 \
+ s_roundevenf-sse4_1 \
+ s_rint-sse4_1 \
+ s_rintf-sse4_1 \
+ s_trunc-sse4_1 \
+ s_truncf-sse4_1 \
+# libm-sysdep_routines
-libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
- e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
- s_sincos-fma
+libm-sysdep_routines += \
+ e_asin-fma \
+ e_atan2-fma \
+ e_exp-fma \
+ e_log-fma \
+ e_pow-fma \
+ s_atan-fma \
+ s_sin-fma \
+ s_sincos-fma \
+ s_tan-fma \
+# libm-sysdep_routines
CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
@@ -23,10 +51,22 @@ CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
-libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
+libm-sysdep_routines += \
+ s_cosf-sse2 \
+ s_sincosf-sse2 \
+ s_sinf-sse2 \
+# libm-sysdep_routines
-libm-sysdep_routines += e_exp2f-fma e_expf-fma e_log2f-fma e_logf-fma \
- e_powf-fma s_sinf-fma s_cosf-fma s_sincosf-fma
+libm-sysdep_routines += \
+ e_exp2f-fma \
+ e_expf-fma \
+ e_log2f-fma \
+ e_logf-fma \
+ e_powf-fma \
+ s_cosf-fma \
+ s_sincosf-fma \
+ s_sinf-fma \
+# libm-sysdep_routines
CFLAGS-e_exp2f-fma.c = -mfma -mavx2
CFLAGS-e_expf-fma.c = -mfma -mavx2
@@ -37,9 +77,17 @@ CFLAGS-s_sinf-fma.c = -mfma -mavx2
CFLAGS-s_cosf-fma.c = -mfma -mavx2
CFLAGS-s_sincosf-fma.c = -mfma -mavx2
-libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
- e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
- s_sincos-fma4
+libm-sysdep_routines += \
+ e_exp-fma4 \
+ e_log-fma4 \
+ e_pow-fma4 \
+ e_asin-fma4 \
+ s_atan-fma4 \
+ e_atan2-fma4 \
+ s_sin-fma4 \
+ s_sincos-fma4 \
+ s_tan-fma4 \
+# libm-sysdep_routines
CFLAGS-e_asin-fma4.c = -mfma4
CFLAGS-e_atan2-fma4.c = -mfma4
@@ -51,9 +99,15 @@ CFLAGS-s_sin-fma4.c = -mfma4
CFLAGS-s_tan-fma4.c = -mfma4
CFLAGS-s_sincos-fma4.c = -mfma4
-libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
- e_atan2-avx s_sin-avx s_tan-avx \
- s_sincos-avx
+libm-sysdep_routines += \
+ e_exp-avx \
+ e_log-avx \
+ s_atan-avx \
+ e_atan2-avx \
+ s_sin-avx \
+ s_sincos-avx \
+ s_tan-avx \
+# libm-sysdep_routines
CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
--
2.27.0