!983 [sync] PR-982: backport from glibc upstream 2.38 branch

From: @openeuler-sync-bot 
Reviewed-by: @liqingqing_1229 
Signed-off-by: @liqingqing_1229
openeuler-ci-bot 2025-01-27 03:00:39 +00:00 committed by Gitee
commit de3e1f1fa3
16 changed files with 2353 additions and 1 deletion


@ -0,0 +1,89 @@
From c32fd59314c343db88c3ea4a203870481d33c3d2 Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Tue, 21 Jan 2025 16:11:06 -0500
Subject: [PATCH] Fix underallocation of abort_msg_s struct
(CVE-2025-0395)
Include the space needed to store the length of the message itself, in
addition to the message string. This resolves BZ #32582.
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 68ee0f704cb81e9ad0a78c644a83e1e9cd2ee578)
---
NEWS | 6 ++++++
assert/assert.c | 4 +++-
sysdeps/posix/libc_fatal.c | 4 +++-
3 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/NEWS b/NEWS
index d0815514e0..3e511d6de4 100644
--- a/NEWS
+++ b/NEWS
@@ -34,6 +34,11 @@ Security related changes:
buffer overflow, which could be exploited to achieve escalated
privileges. This flaw was introduced in glibc 2.34.
+ CVE-2025-0395: When the assert() function fails, it does not allocate
+ enough space for the assertion failure message string and size
+ information, which may lead to a buffer overflow if the message string
+ size aligns to page size.
+
The following bugs are resolved with this release:
[27821] ungetc: Fix backup buffer leak on program exit
@@ -61,6 +66,7 @@ The following bugs are resolved with this release:
[32137] libio: Attempt wide backup free only for non-legacy code
[32231] elf: Change ldconfig auxcache magic number
[32470] x86: Avoid integer truncation with large cache sizes
+ [32582] Fix underallocation of abort_msg_s struct (CVE-2025-0395)
Version 2.38
diff --git a/assert/assert.c b/assert/assert.c
index b7c7a4a1ba..65a9fedf0d 100644
--- a/assert/assert.c
+++ b/assert/assert.c
@@ -18,6 +18,7 @@
#include <assert.h>
#include <atomic.h>
#include <ldsodefs.h>
+#include <libc-pointer-arith.h>
#include <libintl.h>
#include <stdio.h>
#include <stdlib.h>
@@ -64,7 +65,8 @@ __assert_fail_base (const char *fmt, const char *assertion, const char *file,
(void) __fxprintf (NULL, "%s", str);
(void) fflush (stderr);
- total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
+ total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
+ GLRO(dl_pagesize));
struct abort_msg_s *buf = __mmap (NULL, total, PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
if (__glibc_likely (buf != MAP_FAILED))
diff --git a/sysdeps/posix/libc_fatal.c b/sysdeps/posix/libc_fatal.c
index 70edcc10c1..5b9e4b7918 100644
--- a/sysdeps/posix/libc_fatal.c
+++ b/sysdeps/posix/libc_fatal.c
@@ -20,6 +20,7 @@
#include <errno.h>
#include <fcntl.h>
#include <ldsodefs.h>
+#include <libc-pointer-arith.h>
#include <paths.h>
#include <stdarg.h>
#include <stdbool.h>
@@ -123,7 +124,8 @@ __libc_message (const char *fmt, ...)
WRITEV_FOR_FATAL (fd, iov, nlist, total);
- total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
+ total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
+ GLRO(dl_pagesize));
struct abort_msg_s *buf = __mmap (NULL, total,
PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
--
2.27.0
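For illustration, a minimal standalone sketch (not part of the patch; the struct and ALIGN_UP below are simplified stand-ins for the glibc internals) of why the old size computation underallocates when the message length is a multiple of the page size:

#include <stdio.h>
#include <stddef.h>

/* Simplified stand-in for glibc's struct abort_msg_s: a size field
   followed by the message bytes.  */
struct abort_msg { size_t size; char msg[]; };

#define ALIGN_UP(v, a) (((v) + (a) - 1) & ~((size_t) (a) - 1))

int
main (void)
{
  size_t pagesize = 4096;
  size_t total = 4095;   /* message length, so total + 1 == pagesize */

  /* Old computation: rounds total + 1 up to a page but reserves no
     room for the struct header itself.  */
  size_t old_alloc = (total + 1 + pagesize - 1) & ~(pagesize - 1);

  /* New computation: also reserves sizeof (struct abort_msg).  */
  size_t new_alloc = ALIGN_UP (total + sizeof (struct abort_msg) + 1, pagesize);

  size_t needed = sizeof (struct abort_msg) + total + 1;
  /* Prints needed=4104 old=4096 new=8192 on a 64-bit target: the old
     mapping is one header short, so copying the message overflows it.  */
  printf ("needed=%zu old=%zu new=%zu\n", needed, old_alloc, new_alloc);
  return 0;
}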


@ -0,0 +1,50 @@
From 48642ef1a5721e0a7694d84fe46d83b6086dfe75 Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Mon, 3 Jun 2024 10:49:40 +0200
Subject: [PATCH] elf: Avoid some free (NULL) calls in
_dl_update_slotinfo
This has been confirmed to work around some interposed mallocs. Here
is a discussion of the impact test ust/libc-wrapper/test_libc-wrapper
in lttng-tools:
New TLS usage in libgcc_s.so.1, compatibility impact
<https://inbox.sourceware.org/libc-alpha/8734v1ieke.fsf@oldenburg.str.redhat.com/>
Reportedly, this patch also papers over a similar issue when tcmalloc
2.9.1 is not compiled with -ftls-model=initial-exec. Of course the
goal really should be to compile mallocs with the initial-exec TLS
model, but this commit appears to be a useful interim workaround.
Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow
tls access after dlopen [BZ #19924]").
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit afe42e935b3ee97bac9a7064157587777259c60e)
---
elf/dl-tls.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 70446e71a8..de0168319c 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -819,7 +819,14 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
dtv entry free it. Note: this is not AS-safe. */
/* XXX Ideally we will at some point create a memory
pool. */
- free (dtv[modid].pointer.to_free);
+ /* Avoid calling free on a null pointer. Some mallocs
+ incorrectly use dynamic TLS, and depending on how the
+ free function was compiled, it could call
+ __tls_get_addr before the null pointer check in the
+ free implementation. Checking here papers over at
+ least some dynamic TLS usage by interposed mallocs. */
+ if (dtv[modid].pointer.to_free != NULL)
+ free (dtv[modid].pointer.to_free);
dtv[modid].pointer.val = TLS_DTV_UNALLOCATED;
dtv[modid].pointer.to_free = NULL;
--
2.27.0
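The workaround above matters because an interposed free may touch its own thread-local state before it ever checks for NULL. A minimal, hypothetical interposer illustrating that (illustrative names only; built as a shared object without -ftls-model=initial-exec, the __thread access goes through __tls_get_addr):

#define _GNU_SOURCE
#include <stdlib.h>
#include <dlfcn.h>

/* Global-dynamic TLS: each access calls __tls_get_addr unless the
   object is compiled with -ftls-model=initial-exec.  */
static __thread unsigned long free_calls;

static void (*real_free) (void *);

static void __attribute__ ((constructor))
init (void)
{
  real_free = (void (*) (void *)) dlsym (RTLD_NEXT, "free");
}

void
free (void *ptr)
{
  /* The TLS access happens before any NULL check, so even free (NULL)
     issued during the dynamic linker's DTV update can re-enter
     __tls_get_addr -- which is what the patch above sidesteps by not
     calling free for null pointers.  */
  ++free_calls;
  if (ptr != NULL)
    real_free (ptr);
}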


@ -0,0 +1,328 @@
From 7772f9358c9a947251196ea7844b339f0a423ff6 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
Date: Tue, 16 Feb 2021 12:55:13 +0000
Subject: [PATCH] elf: Fix slow tls access after dlopen [BZ #19924]
In short: __tls_get_addr checks the global generation counter and if
the current dtv is older, then _dl_update_slotinfo updates dtv up to the
generation of the accessed module. So if the global generation is newer
than generation of the module then __tls_get_addr keeps hitting the
slow dtv update path. The dtv update path includes a number of checks
to see if any update is needed and this already causes measurable tls
access slow down after dlopen.
It may be possible to detect up-to-date dtv faster. But if there are
many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at
least walking the slotinfo list.
This patch tries to update the dtv to the global generation instead, so
after a dlopen the tls access slow path is only hit once. The modules
with larger generation than the accessed one were not necessarily
synchronized before, so additional synchronization is needed.
This patch uses acquire/release synchronization when accessing the
generation counter.
Note: in the x86_64 version of dl-tls.c the generation is only loaded
once, since relaxed mo is not faster than acquire mo load.
I have not benchmarked this. Tested by Adhemerval Zanella on aarch64,
powerpc, sparc, x86 who reported that it fixes the performance issue
of bug 19924.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit d2123d68275acc0f061e73d5f86ca504e0d5a344)
---
elf/dl-close.c | 2 +-
elf/dl-open.c | 8 +--
elf/dl-reloc.c | 6 +-
elf/dl-tls.c | 117 ++++++++++++++++++++-----------------
sysdeps/generic/ldsodefs.h | 3 +-
sysdeps/x86_64/dl-tls.c | 4 +-
6 files changed, 74 insertions(+), 66 deletions(-)
diff --git a/elf/dl-close.c b/elf/dl-close.c
index b887a44888..1c7a861db1 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -703,7 +703,7 @@ _dl_close_worker (struct link_map *map, bool force)
if (__glibc_unlikely (newgen == 0))
_dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n");
/* Can be read concurrently. */
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+ atomic_store_release (&GL(dl_tls_generation), newgen);
if (tls_free_end == GL(dl_tls_static_used))
GL(dl_tls_static_used) = tls_free_start;
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 2d985e21d8..351931af04 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -405,7 +405,7 @@ update_tls_slotinfo (struct link_map *new)
_dl_fatal_printf (N_("\
TLS generation counter wrapped! Please report this."));
/* Can be read concurrently. */
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+ atomic_store_release (&GL(dl_tls_generation), newgen);
/* We need a second pass for static tls data, because
_dl_update_slotinfo must not be run while calls to
@@ -422,8 +422,8 @@ TLS generation counter wrapped! Please report this."));
now, but we can delay updating the DTV. */
imap->l_need_tls_init = 0;
#ifdef SHARED
- /* Update the slot information data for at least the
- generation of the DSO we are allocating data for. */
+ /* Update the slot information data for the current
+ generation. */
/* FIXME: This can terminate the process on memory
allocation failure. It is not possible to raise
@@ -431,7 +431,7 @@ TLS generation counter wrapped! Please report this."));
_dl_update_slotinfo would have to be split into two
operations, similar to resize_scopes and update_scopes
above. This is related to bug 16134. */
- _dl_update_slotinfo (imap->l_tls_modid);
+ _dl_update_slotinfo (imap->l_tls_modid, newgen);
#endif
dl_init_static_tls (imap);
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index 1d558c1e0c..e5c555d82c 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -112,11 +112,11 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional)
if (map->l_real->l_relocated)
{
#ifdef SHARED
+ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock)
+ is held here so normal load of the generation counter is valid. */
if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
0))
- /* Update the slot information data for at least the generation of
- the DSO we are allocating data for. */
- (void) _dl_update_slotinfo (map->l_tls_modid);
+ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
#endif
dl_init_static_tls (map);
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 1f6f820819..70446e71a8 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map)
struct link_map *
-_dl_update_slotinfo (unsigned long int req_modid)
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
{
struct link_map *the_map = NULL;
dtv_t *dtv = THREAD_DTV ();
- /* The global dl_tls_dtv_slotinfo array contains for each module
- index the generation counter current when the entry was created.
+ /* CONCURRENCY NOTES:
+
+ The global dl_tls_dtv_slotinfo_list array contains for each module
+ index the generation counter current when that entry was updated.
This array never shrinks so that all module indices which were
- valid at some time can be used to access it. Before the first
- use of a new module index in this function the array was extended
- appropriately. Access also does not have to be guarded against
- modifications of the array. It is assumed that pointer-size
- values can be read atomically even in SMP environments. It is
- possible that other threads at the same time dynamically load
- code and therefore add to the slotinfo list. This is a problem
- since we must not pick up any information about incomplete work.
- The solution to this is to ignore all dtv slots which were
- created after the one we are currently interested. We know that
- dynamic loading for this module is completed and this is the last
- load operation we know finished. */
- unsigned long int idx = req_modid;
+ valid at some time can be used to access it. Concurrent loading
+ and unloading of modules can update slotinfo entries or extend
+ the array. The updates happen under the GL(dl_load_tls_lock) and
+ finish with the release store of the generation counter to
+ GL(dl_tls_generation) which is synchronized with the load of
+ new_gen in the caller. So updates up to new_gen are synchronized
+ but updates for later generations may not be.
+
+ Here we update the thread dtv from old_gen (== dtv[0].counter) to
+ new_gen generation. For this, each dtv[i] entry is either set to
+ an unallocated state (set), or left unmodified (nop). Where (set)
+ may resize the dtv first if modid i >= dtv[-1].counter. The rules
+ for the decision between (set) and (nop) are
+
+ (1) If slotinfo entry i is concurrently updated then either (set)
+ or (nop) is valid: TLS access cannot use dtv[i] unless it is
+ synchronized with a generation > new_gen.
+
+ Otherwise, if the generation of slotinfo entry i is gen and the
+ loaded module for this entry is map then
+
+ (2) If gen <= old_gen then do (nop).
+
+ (3) If old_gen < gen <= new_gen then
+ (3.1) if map != 0 then (set)
+ (3.2) if map == 0 then either (set) or (nop).
+
+ Note that (1) cannot be reliably detected, but since both actions
+ are valid it does not have to be. Only (2) and (3.1) cases need
+ to be distinguished for which relaxed mo access of gen and map is
+ enough: their value is synchronized when it matters.
+
+ Note that a relaxed mo load may give an out-of-thin-air value since
+ it is used in decisions that can affect concurrent stores. But this
+ should only happen if the OOTA value causes UB that justifies the
+ concurrent store of the value. This is not expected to be an issue
+ in practice. */
struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
- while (idx >= listp->len)
+ if (dtv[0].counter < new_gen)
{
- idx -= listp->len;
- listp = listp->next;
- }
-
- if (dtv[0].counter < listp->slotinfo[idx].gen)
- {
- /* CONCURRENCY NOTES:
-
- Here the dtv needs to be updated to new_gen generation count.
-
- This code may be called during TLS access when GL(dl_load_tls_lock)
- is not held. In that case the user code has to synchronize with
- dlopen and dlclose calls of relevant modules. A module m is
- relevant if the generation of m <= new_gen and dlclose of m is
- synchronized: a memory access here happens after the dlopen and
- before the dlclose of relevant modules. The dtv entries for
- relevant modules need to be updated, other entries can be
- arbitrary.
-
- This e.g. means that the first part of the slotinfo list can be
- accessed race free, but the tail may be concurrently extended.
- Similarly relevant slotinfo entries can be read race free, but
- other entries are racy. However updating a non-relevant dtv
- entry does not affect correctness. For a relevant module m,
- max_modid >= modid of m. */
- size_t new_gen = listp->slotinfo[idx].gen;
size_t total = 0;
size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
assert (max_modid >= req_modid);
@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int req_modid)
{
size_t modid = total + cnt;
- /* Later entries are not relevant. */
+ /* Case (1) for all later modids. */
if (modid > max_modid)
break;
size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);
+ /* Case (1). */
if (gen > new_gen)
- /* Not relevant. */
continue;
- /* If the entry is older than the current dtv layout we
- know we don't have to handle it. */
+ /* Case (2) or (1). */
if (gen <= dtv[0].counter)
continue;
+ /* Case (3) or (1). */
+
/* If there is no map this means the entry is empty. */
struct link_map *map
= atomic_load_relaxed (&listp->slotinfo[cnt].map);
/* Check whether the current dtv array is large enough. */
if (dtv[-1].counter < modid)
{
+ /* Case (3.2) or (1). */
if (map == NULL)
continue;
- /* Resize the dtv. */
+ /* Resizing the dtv aborts on failure: bug 16134. */
dtv = _dl_resize_dtv (dtv, max_modid);
assert (modid <= dtv[-1].counter);
@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int req_modid)
}
/* If there is currently memory allocate for this
- dtv entry free it. */
+ dtv entry free it. Note: this is not AS-safe. */
/* XXX Ideally we will at some point create a memory
pool. */
free (dtv[modid].pointer.to_free);
@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
static struct link_map *
__attribute_noinline__
-update_get_addr (GET_ADDR_ARGS)
+update_get_addr (GET_ADDR_ARGS, size_t gen)
{
- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
+ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
dtv_t *dtv = THREAD_DTV ();
void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS)
dtv_t *dtv = THREAD_DTV ();
/* Update is needed if dtv[0].counter < the generation of the accessed
- module. The global generation counter is used here as it is easier
- to check. Synchronization for the relaxed MO access is guaranteed
- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */
+ module, but the global generation counter is easier to check (which
+ must be synchronized up to the generation of the accessed module by
+ user code doing the TLS access so relaxed mo read is enough). */
size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
- return update_get_addr (GET_ADDR_PARAM);
+ {
+ /* Update DTV up to the global generation, see CONCURRENCY NOTES
+ in _dl_update_slotinfo. */
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
+ return update_get_addr (GET_ADDR_PARAM, gen);
+ }
void *p = dtv[GET_ADDR_MODULE].pointer.val;
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index e8b7359b04..ed69c6babd 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1251,7 +1251,8 @@ extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)
/* Update slot information data for at least the generation of the
module with the given index. */
-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
+extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
+ size_t gen)
attribute_hidden;
/* Look up the module's TLS block as for __tls_get_addr,
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 7a7fe38625..e9b6ab9970 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
{
dtv_t *dtv = THREAD_DTV ();
- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+ size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
- return update_get_addr (GET_ADDR_PARAM);
+ return update_get_addr (GET_ADDR_PARAM, gen);
return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
}
--
2.27.0
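A condensed sketch (hypothetical names, C11 atomics; not the glibc code) of the release/acquire pairing this patch establishes between the writers in dlopen/dlclose and the reader in the __tls_get_addr slow path:

#include <stdatomic.h>
#include <stddef.h>

/* Stand-ins for GL(dl_tls_generation) and one slotinfo entry.  */
_Atomic size_t tls_generation;
struct slotinfo { _Atomic size_t gen; void *map; };

/* Writer (dlopen/dlclose): publish the slotinfo update first, then
   bump the generation with a release store, so a reader that observes
   the new generation also observes the slotinfo contents.  */
void
publish_update (struct slotinfo *si, size_t newgen, void *map)
{
  si->map = map;
  atomic_store_explicit (&si->gen, newgen, memory_order_relaxed);
  atomic_store_explicit (&tls_generation, newgen, memory_order_release);
}

/* Reader (__tls_get_addr slow path): an acquire load of the global
   generation synchronizes with the release store above, so the DTV can
   be brought up to that generation in a single pass.  */
size_t
read_generation (void)
{
  return atomic_load_explicit (&tls_generation, memory_order_acquire);
}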


@ -0,0 +1,521 @@
From 549e7f7c5a94f5ccbab2ad5f1babca05028a31c7 Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Mon, 1 Jul 2024 17:42:04 +0200
Subject: [PATCH] elf: Support recursive use of dynamic TLS in interposed
malloc
It turns out that quite a few applications use bundled mallocs that
have been built to use global-dynamic TLS (instead of the recommended
initial-exec TLS). The previous workaround from
commit afe42e935b3ee97bac9a7064157587777259c60e ("elf: Avoid some
free (NULL) calls in _dl_update_slotinfo") does not fix all
encountered cases unfortunately.
This change avoids the TLS generation update for recursive use
of TLS from a malloc that was called during a TLS update. This
is possible because an interposed malloc has a fixed module ID and
TLS slot. (It cannot be unloaded.) If an initially-loaded module ID
is encountered in __tls_get_addr and the dynamic linker is already
in the middle of a TLS update, use the outdated DTV, thus avoiding
another call into malloc. It's still necessary to update the
DTV to the most recent generation, to get out of the slow path,
which is why the check for recursion is needed.
The bookkeeping is done using a global counter instead of a per-thread
flag because TLS access in the dynamic linker is tricky.
All this will go away once the dynamic linker stops using malloc
for TLS, likely as part of a change that pre-allocates all TLS
during pthread_create/dlopen.
Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow
tls access after dlopen [BZ #19924]").
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit 018f0fc3b818d4d1460a4e2384c24802504b1d20)
Conflict: adapt file "elf/Makefile" for patch "elf: Switch to main
malloc after final ld.so self-relocation"
---
elf/Makefile | 26 +++++++++
elf/dl-tls.c | 95 +++++++++++++++++++++++++++++---
elf/rtld.c | 2 +
elf/tst-recursive-tls.c | 60 ++++++++++++++++++++
elf/tst-recursive-tlsmallocmod.c | 64 +++++++++++++++++++++
elf/tst-recursive-tlsmodN.c | 28 ++++++++++
sysdeps/generic/ldsodefs.h | 14 +++++
sysdeps/x86_64/dl-tls.c | 5 +-
8 files changed, 284 insertions(+), 10 deletions(-)
create mode 100644 elf/tst-recursive-tls.c
create mode 100644 elf/tst-recursive-tlsmallocmod.c
create mode 100644 elf/tst-recursive-tlsmodN.c
diff --git a/elf/Makefile b/elf/Makefile
index ea98cba8..391f29e9 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -433,6 +433,7 @@ tests += \
tst-p_align1 \
tst-p_align2 \
tst-p_align3 \
+ tst-recursive-tls \
tst-relsort1 \
tst-ro-dynamic \
tst-rtld-no-malloc \
@@ -865,6 +866,23 @@ modules-names += \
tst-null-argv-lib \
tst-p_alignmod-base \
tst-p_alignmod3 \
+ tst-recursive-tlsmallocmod \
+ tst-recursive-tlsmod0 \
+ tst-recursive-tlsmod1 \
+ tst-recursive-tlsmod2 \
+ tst-recursive-tlsmod3 \
+ tst-recursive-tlsmod4 \
+ tst-recursive-tlsmod5 \
+ tst-recursive-tlsmod6 \
+ tst-recursive-tlsmod7 \
+ tst-recursive-tlsmod8 \
+ tst-recursive-tlsmod9 \
+ tst-recursive-tlsmod10 \
+ tst-recursive-tlsmod11 \
+ tst-recursive-tlsmod12 \
+ tst-recursive-tlsmod13 \
+ tst-recursive-tlsmod14 \
+ tst-recursive-tlsmod15 \
tst-relsort1mod1 \
tst-relsort1mod2 \
tst-ro-dynamic-mod \
@@ -3042,6 +3060,14 @@ CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
endif
+$(objpfx)tst-recursive-tls: $(objpfx)tst-recursive-tlsmallocmod.so
+# More objects than DTV_SURPLUS, to trigger DTV reallocation.
+$(objpfx)tst-recursive-tls.out: \
+ $(patsubst %,$(objpfx)tst-recursive-tlsmod%.so, \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+$(objpfx)tst-recursive-tlsmod%.os: tst-recursive-tlsmodN.c
+ $(compile-command.c) -DVAR=thread_$* -DFUNC=get_threadvar_$*
+
# Reuse an audit module which provides ample debug logging.
tst-rtld-no-malloc-audit-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index de016831..59d4021e 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -75,6 +75,31 @@
/* Default for dl_tls_static_optional. */
#define OPTIONAL_TLS 512
+/* Used to count the number of threads currently executing dynamic TLS
+ updates. Used to avoid recursive malloc calls in __tls_get_addr
+ for an interposed malloc that uses global-dynamic TLS (which is not
+ recommended); see _dl_tls_allocate_active checks. This could be a
+ per-thread flag, but would need TLS access in the dynamic linker. */
+unsigned int _dl_tls_threads_in_update;
+
+static inline void
+_dl_tls_allocate_begin (void)
+{
+ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1);
+}
+
+static inline void
+_dl_tls_allocate_end (void)
+{
+ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1);
+}
+
+static inline bool
+_dl_tls_allocate_active (void)
+{
+ return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0;
+}
+
/* Compute the static TLS surplus based on the namespace count and the
TLS space that can be used for optimizations. */
static inline int
@@ -425,12 +450,18 @@ _dl_allocate_tls_storage (void)
size += TLS_PRE_TCB_SIZE;
#endif
- /* Perform the allocation. Reserve space for the required alignment
- and the pointer to the original allocation. */
+ /* Reserve space for the required alignment and the pointer to the
+ original allocation. */
size_t alignment = GLRO (dl_tls_static_align);
+
+ /* Perform the allocation. */
+ _dl_tls_allocate_begin ();
void *allocated = malloc (size + alignment + sizeof (void *));
if (__glibc_unlikely (allocated == NULL))
- return NULL;
+ {
+ _dl_tls_allocate_end ();
+ return NULL;
+ }
/* Perform alignment and allocate the DTV. */
#if TLS_TCB_AT_TP
@@ -466,6 +497,8 @@ _dl_allocate_tls_storage (void)
result = allocate_dtv (result);
if (result == NULL)
free (allocated);
+
+ _dl_tls_allocate_end ();
return result;
}
@@ -483,6 +516,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
size_t newsize = max_modid + DTV_SURPLUS;
size_t oldsize = dtv[-1].counter;
+ _dl_tls_allocate_begin ();
if (dtv == GL(dl_initial_dtv))
{
/* This is the initial dtv that was either statically allocated in
@@ -502,6 +536,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
if (newp == NULL)
oom ();
}
+ _dl_tls_allocate_end ();
newp[0].counter = newsize;
@@ -676,7 +711,9 @@ allocate_dtv_entry (size_t alignment, size_t size)
if (powerof2 (alignment) && alignment <= _Alignof (max_align_t))
{
/* The alignment is supported by malloc. */
+ _dl_tls_allocate_begin ();
void *ptr = malloc (size);
+ _dl_tls_allocate_end ();
return (struct dtv_pointer) { ptr, ptr };
}
@@ -688,7 +725,10 @@ allocate_dtv_entry (size_t alignment, size_t size)
/* Perform the allocation. This is the pointer we need to free
later. */
+ _dl_tls_allocate_begin ();
void *start = malloc (alloc_size);
+ _dl_tls_allocate_end ();
+
if (start == NULL)
return (struct dtv_pointer) {};
@@ -826,7 +866,11 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
free implementation. Checking here papers over at
least some dynamic TLS usage by interposed mallocs. */
if (dtv[modid].pointer.to_free != NULL)
- free (dtv[modid].pointer.to_free);
+ {
+ _dl_tls_allocate_begin ();
+ free (dtv[modid].pointer.to_free);
+ _dl_tls_allocate_end ();
+ }
dtv[modid].pointer.val = TLS_DTV_UNALLOCATED;
dtv[modid].pointer.to_free = NULL;
@@ -956,10 +1000,22 @@ __tls_get_addr (GET_ADDR_ARGS)
size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
{
- /* Update DTV up to the global generation, see CONCURRENCY NOTES
- in _dl_update_slotinfo. */
- gen = atomic_load_acquire (&GL(dl_tls_generation));
- return update_get_addr (GET_ADDR_PARAM, gen);
+ if (_dl_tls_allocate_active ()
+ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)
+ /* This is a reentrant __tls_get_addr call, but we can
+ satisfy it because it's an initially-loaded module ID.
+ These TLS slotinfo slots do not change, so the
+ out-of-date generation counter does not matter. However,
+ if not in a TLS update, still update_get_addr below, to
+ get off the slow path eventually. */
+ ;
+ else
+ {
+ /* Update DTV up to the global generation, see CONCURRENCY NOTES
+ in _dl_update_slotinfo. */
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
+ return update_get_addr (GET_ADDR_PARAM, gen);
+ }
}
void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -969,7 +1025,7 @@ __tls_get_addr (GET_ADDR_ARGS)
return (char *) p + GET_ADDR_OFFSET;
}
-#endif
+#endif /* SHARED */
/* Look up the module's TLS block as for __tls_get_addr,
@@ -1018,6 +1074,25 @@ _dl_tls_get_addr_soft (struct link_map *l)
return data;
}
+size_t _dl_tls_initial_modid_limit;
+
+void
+_dl_tls_initial_modid_limit_setup (void)
+{
+ struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
+ size_t idx;
+ for (idx = 0; idx < listp->len; ++idx)
+ {
+ struct link_map *l = listp->slotinfo[idx].map;
+ if (l == NULL
+ /* The object can be unloaded, so its modid can be
+ reassociated. */
+ || !(l->l_type == lt_executable || l->l_type == lt_library))
+ break;
+ }
+ _dl_tls_initial_modid_limit = idx;
+}
+
void
_dl_add_to_slotinfo (struct link_map *l, bool do_add)
@@ -1050,9 +1125,11 @@ _dl_add_to_slotinfo (struct link_map *l, bool do_add)
the first slot. */
assert (idx == 0);
+ _dl_tls_allocate_begin ();
listp = (struct dtv_slotinfo_list *)
malloc (sizeof (struct dtv_slotinfo_list)
+ TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
+ _dl_tls_allocate_end ();
if (listp == NULL)
{
/* We ran out of memory while resizing the dtv slotinfo list. */
diff --git a/elf/rtld.c b/elf/rtld.c
index 558733b8..0a1e202c 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -789,6 +789,8 @@ init_tls (size_t naudit)
_dl_fatal_printf ("\
cannot allocate TLS data structures for initial thread\n");
+ _dl_tls_initial_modid_limit_setup ();
+
/* Store for detection of the special case by __tls_get_addr
so it knows not to pass this dtv to the normal realloc. */
GL(dl_initial_dtv) = GET_DTV (tcbp);
diff --git a/elf/tst-recursive-tls.c b/elf/tst-recursive-tls.c
new file mode 100644
index 00000000..716d1f78
--- /dev/null
+++ b/elf/tst-recursive-tls.c
@@ -0,0 +1,60 @@
+/* Test with interposed malloc with dynamic TLS.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+/* Defined in tst-recursive-tlsmallocmod.so. */
+extern __thread unsigned int malloc_subsytem_counter;
+
+static int
+do_test (void)
+{
+ /* 16 is large enough to exercise the DTV resizing case. */
+ void *handles[16];
+
+ for (unsigned int i = 0; i < array_length (handles); ++i)
+ {
+ /* Re-use the TLS slot for module 0. */
+ if (i > 0)
+ xdlclose (handles[0]);
+
+ char soname[30];
+ snprintf (soname, sizeof (soname), "tst-recursive-tlsmod%u.so", i);
+ handles[i] = xdlopen (soname, RTLD_NOW);
+
+ if (i > 0)
+ {
+ handles[0] = xdlopen ("tst-recursive-tlsmod0.so", RTLD_NOW);
+ int (*fptr) (void) = xdlsym (handles[0], "get_threadvar_0");
+ /* May trigger TLS storage allocation using malloc. */
+ TEST_COMPARE (fptr (), 0);
+ }
+ }
+
+ for (unsigned int i = 0; i < array_length (handles); ++i)
+ xdlclose (handles[i]);
+
+ printf ("info: malloc subsystem calls: %u\n", malloc_subsytem_counter);
+ TEST_VERIFY (malloc_subsytem_counter > 0);
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-recursive-tlsmallocmod.c b/elf/tst-recursive-tlsmallocmod.c
new file mode 100644
index 00000000..c24e9945
--- /dev/null
+++ b/elf/tst-recursive-tlsmallocmod.c
@@ -0,0 +1,64 @@
+/* Interposed malloc with dynamic TLS.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <dlfcn.h>
+
+__thread unsigned int malloc_subsytem_counter;
+
+static __typeof (malloc) *malloc_fptr;
+static __typeof (free) *free_fptr;
+static __typeof (calloc) *calloc_fptr;
+static __typeof (realloc) *realloc_fptr;
+
+static void __attribute__ ((constructor))
+init (void)
+{
+ malloc_fptr = dlsym (RTLD_NEXT, "malloc");
+ free_fptr = dlsym (RTLD_NEXT, "free");
+ calloc_fptr = dlsym (RTLD_NEXT, "calloc");
+ realloc_fptr = dlsym (RTLD_NEXT, "realloc");
+}
+
+void *
+malloc (size_t size)
+{
+ ++malloc_subsytem_counter;
+ return malloc_fptr (size);
+}
+
+void
+free (void *ptr)
+{
+ ++malloc_subsytem_counter;
+ return free_fptr (ptr);
+}
+
+void *
+calloc (size_t a, size_t b)
+{
+ ++malloc_subsytem_counter;
+ return calloc_fptr (a, b);
+}
+
+void *
+realloc (void *ptr, size_t size)
+{
+ ++malloc_subsytem_counter;
+ return realloc_fptr (ptr, size);
+}
diff --git a/elf/tst-recursive-tlsmodN.c b/elf/tst-recursive-tlsmodN.c
new file mode 100644
index 00000000..bb7592ae
--- /dev/null
+++ b/elf/tst-recursive-tlsmodN.c
@@ -0,0 +1,28 @@
+/* Test module with global-dynamic TLS. Used to trigger DTV reallocation.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Compiled with VAR and FUNC set via -D. FUNC requires some
+ relocation against TLS variable VAR. */
+
+__thread int VAR;
+
+int
+FUNC (void)
+{
+ return VAR;
+}
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 22fbbecd..ad271ae0 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1262,6 +1262,20 @@ extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
size_t gen)
attribute_hidden;
+/* The last TLS module ID that is initially loaded, plus 1. TLS
+ addresses for modules with IDs lower than that can be obtained from
+ the DTV even if its generation is outdated. */
+extern size_t _dl_tls_initial_modid_limit attribute_hidden attribute_relro;
+
+/* Compute _dl_tls_initial_modid_limit. To be called after initial
+ relocation. */
+void _dl_tls_initial_modid_limit_setup (void) attribute_hidden;
+
+/* Number of threads currently in a TLS update. This is used to
+ detect reentrant __tls_get_addr calls without a per-thread
+ flag. */
+extern unsigned int _dl_tls_threads_in_update attribute_hidden;
+
/* Look up the module's TLS block as for __tls_get_addr,
but never touch anything. Return null if it's not allocated yet. */
extern void *_dl_tls_get_addr_soft (struct link_map *l) attribute_hidden;
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index e9b6ab99..c484f39e 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
dtv_t *dtv = THREAD_DTV ();
size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
- if (__glibc_unlikely (dtv[0].counter != gen))
+ if (__glibc_unlikely (dtv[0].counter != gen)
+ /* See comment in __tls_get_addr in elf/dl-tls.c. */
+ && !(_dl_tls_allocate_active ()
+ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit))
return update_get_addr (GET_ADDR_PARAM, gen);
return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
--
2.27.0
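The message above recommends building bundled allocators with initial-exec TLS so their thread-local accesses bypass __tls_get_addr entirely; a minimal sketch of the two standard ways to get that with GCC/Clang (hypothetical variable name):

/* Either compile the whole interposer with -ftls-model=initial-exec,
   or mark individual thread-local variables explicitly:  */
__thread unsigned long malloc_calls
  __attribute__ ((tls_model ("initial-exec")));

unsigned long
bump_malloc_calls (void)
{
  /* With initial-exec TLS this is a direct thread-pointer-relative
     access, not a call into __tls_get_addr, so it cannot recurse into
     the dynamic linker's TLS machinery.  */
  return ++malloc_calls;
}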


@ -67,7 +67,7 @@
##############################################################################
Name: glibc
Version: 2.38
-Release: 51
+Release: 52
Summary: The GNU libc libraries
License: %{all_license}
URL: http://www.gnu.org/software/glibc/
@ -252,6 +252,21 @@ Patch162: nptl-initialize-rseq-area-prior-to-registration.patch
Patch163: nptl-initialize-cpu_id_start-prior-to-rseq-registrat.patch
Patch164: x86-Avoid-integer-truncation-with-large-cache-sizes-.patch
Patch165: LoongArch-Force-SHMLBA-the-same-as-kernel.patch
+Patch166: x86_64-Sort-fpu-multiarch-Makefile.patch
+Patch167: x86_64-Add-log2-with-FMA.patch
+Patch168: x86_64-Add-expm1-with-FMA.patch
+Patch169: x86_64-Add-log1p-with-FMA.patch
+Patch170: x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch
+Patch171: elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch
+Patch172: x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
+Patch173: sysdeps-x86-Makefile-Split-and-sort-tests.patch
+Patch174: x86_64-Fix-missing-wcsncat-function-definition-witho.patch
+Patch175: x86-Improve-large-memset-perf-with-non-temporal-stor.patch
+Patch176: x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
+Patch177: elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
+Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
+Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
+Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
#openEuler patch list
Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
@ -1471,6 +1486,23 @@ fi
%endif
%changelog
+* Sun Jan 26 2025 Qingqing Li <liqingqing3@huawei.com> - 2.38-52
+- stdlib: Test using setenv with updated environ [BZ #32588]
+- Fix underallocation of abort_msg_s struct (CVE-2025-0395)
+- elf: Support recursive use of dynamic TLS in interposed malloc
+- elf: Avoid some free (NULL) calls in _dl_update_slotinfo
+- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212]
+- x86: Improve large memset perf with non-temporal stores [RHEL-29312]
+- x86_64: Fix missing wcsncat function definition without multiarch (x86-64-v4)
+- sysdeps/x86/Makefile: Split and sort tests
+- x86: Only align destination to 1x VEC_SIZE in memset 4x loop
+- elf: Fix slow tls access after dlopen [BZ #19924]
+- x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643]
+- x86_64: Add log1p with FMA
+- x86_64: Add expm1 with FMA
+- x86_64: Add log2 with FMA
+- x86_64: Sort fpu/multiarch/Makefile
* Wed Jan 15 2025 MayShao <mayshao-oc@zhaoxin.com> - 2.38-51
- x86: Set preferred CPU features and default NT threshold for Zhaoxin processors


@ -0,0 +1,75 @@
From 650a0aaaffa9ddb44732fa6156b31c5f30ee596f Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 24 Jan 2025 18:53:13 +0800
Subject: [PATCH] stdlib: Test using setenv with updated environ [BZ
#32588]
Add a test for setenv with updated environ. Verify that BZ #32588 is
fixed.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Florian Weimer <fweimer@redhat.com>
(cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da)
---
stdlib/Makefile | 1 +
stdlib/tst-setenv-environ.c | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+)
create mode 100644 stdlib/tst-setenv-environ.c
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 25e42a77e7..750810ee92 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -232,6 +232,7 @@ tests := \
tst-setcontext7 \
tst-setcontext8 \
tst-setcontext9 \
+ tst-setenv-environ \
tst-strfmon_l \
tst-strfrom \
tst-strfrom-locale \
diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c
new file mode 100644
index 0000000000..02fcef96d0
--- /dev/null
+++ b/stdlib/tst-setenv-environ.c
@@ -0,0 +1,36 @@
+/* Test using setenv with updated environ.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <support/check.h>
+
+extern char **environ;
+
+int
+do_test (void)
+{
+ char *valp;
+ static char *dummy_environ[] = { NULL };
+ environ = dummy_environ;
+ setenv ("A", "1", 0);
+ valp = getenv ("A");
+ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0');
+ return 0;
+}
+
+#include <support/test-driver.c>
--
2.27.0


@ -0,0 +1,178 @@
From 0d14bf0754ee8d8cf2bf3dad298fa5c5f97537db Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 7 Dec 2023 09:00:11 -0800
Subject: [PATCH] sysdeps/x86/Makefile: Split and sort tests
Put each test on a separate line and sort tests.
(cherry picked from commit 7e03e0de7e7c2de975b5c5e18f5a4b0c75816674)
---
sysdeps/x86/Makefile | 110 ++++++++++++++++++++++++++++++-------------
1 file changed, 78 insertions(+), 32 deletions(-)
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 917c26f116..5631a59a26 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -10,36 +10,51 @@ sysdep_headers += sys/platform/x86.h bits/platform/x86.h
CFLAGS-dl-get-cpu-features.os += $(rtld-early-cflags)
CFLAGS-get-cpuid-feature-leaf.o += $(no-stack-protector)
-tests += tst-get-cpu-features tst-get-cpu-features-static \
- tst-cpu-features-cpuinfo tst-cpu-features-cpuinfo-static \
- tst-cpu-features-supports tst-cpu-features-supports-static
-tests-static += tst-get-cpu-features-static \
- tst-cpu-features-cpuinfo-static \
- tst-cpu-features-supports-static
+tests += \
+ tst-get-cpu-features \
+ tst-get-cpu-features-static \
+ tst-cpu-features-cpuinfo \
+ tst-cpu-features-cpuinfo-static \
+ tst-cpu-features-supports \
+ tst-cpu-features-supports-static \
+# tests
+tests-static += \
+ tst-get-cpu-features-static \
+ tst-cpu-features-cpuinfo-static \
+ tst-cpu-features-supports-static \
+# tests-static
ifeq (yes,$(have-ifunc))
ifeq (yes,$(have-gcc-ifunc))
tests += \
tst-ifunc-isa-1 \
- tst-ifunc-isa-1-static
+ tst-ifunc-isa-1-static \
+# tests
tests-static += \
- tst-ifunc-isa-1-static
+ tst-ifunc-isa-1-static \
+# tests-static
test-xfail-tst-ifunc-isa-1 = $(with-lld)
test-xfail-tst-ifunc-isa-1-static = $(with-lld)
tests += \
tst-ifunc-isa-2 \
- tst-ifunc-isa-2-static
+ tst-ifunc-isa-2-static \
+# tests
tests-static += \
- tst-ifunc-isa-2-static
+ tst-ifunc-isa-2-static \
+# tests-static
test-xfail-tst-ifunc-isa-2 = $(with-lld)
test-xfail-tst-ifunc-isa-2-static = $(with-lld)
endif
endif
ifeq (yes,$(enable-x86-isa-level))
-tests += tst-isa-level-1
-modules-names += tst-isa-level-mod-1-baseline \
- tst-isa-level-mod-1-v2 \
- tst-isa-level-mod-1-v3 \
- tst-isa-level-mod-1-v4 \
+tests += \
+ tst-isa-level-1 \
+# tests
+modules-names += \
+ tst-isa-level-mod-1-baseline \
+ tst-isa-level-mod-1-v2 \
+ tst-isa-level-mod-1-v3 \
+ tst-isa-level-mod-1-v4 \
+# modules-names
# X86 ISA level baseline
CFLAGS-tst-isa-level-mod-1-baseline.c += -DINCLUDE_X86_ISA_LEVEL \
@@ -68,14 +83,18 @@ tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
endif
ifeq ($(subdir),math)
-tests += tst-ldbl-nonnormal-printf
+tests += \
+ tst-ldbl-nonnormal-printf \
+# tests
endif # $(subdir) == math
ifeq ($(subdir),setjmp)
gen-as-const-headers += jmp_buf-ssp.sym
sysdep_routines += __longjmp_cancel
ifneq ($(enable-cet),no)
-tests += tst-setjmp-cet
+tests += \
+ tst-setjmp-cet \
+# tests
tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on
endif
endif
@@ -122,20 +141,45 @@ ifneq ($(enable-cet),no)
ifeq ($(subdir),elf)
sysdep-dl-routines += dl-cet
-tests += tst-cet-legacy-1 tst-cet-legacy-1a tst-cet-legacy-2 \
- tst-cet-legacy-2a tst-cet-legacy-3 tst-cet-legacy-4 \
- tst-cet-legacy-5a tst-cet-legacy-6a tst-cet-legacy-7 \
- tst-cet-legacy-8 tst-cet-legacy-9 tst-cet-legacy-9-static \
- tst-cet-legacy-10 tst-cet-legacy-10-static
-tests-static += tst-cet-legacy-9-static tst-cet-legacy-10-static
+tests += \
+ tst-cet-legacy-1 \
+ tst-cet-legacy-1a \
+ tst-cet-legacy-2 \
+ tst-cet-legacy-2a \
+ tst-cet-legacy-3 \
+ tst-cet-legacy-4 \
+ tst-cet-legacy-5a \
+ tst-cet-legacy-6a \
+ tst-cet-legacy-7 \
+ tst-cet-legacy-8 \
+ tst-cet-legacy-9 \
+ tst-cet-legacy-9-static \
+ tst-cet-legacy-10 \
+ tst-cet-legacy-10-static \
+# tests
+tests-static += \
+ tst-cet-legacy-9-static \
+ tst-cet-legacy-10-static \
+# tests-static
tst-cet-legacy-1a-ARGS = -- $(host-test-program-cmd)
-tests += tst-cet-legacy-4a tst-cet-legacy-4b tst-cet-legacy-4c \
- tst-cet-legacy-5b tst-cet-legacy-6b
-modules-names += tst-cet-legacy-mod-1 tst-cet-legacy-mod-2 \
- tst-cet-legacy-mod-4 tst-cet-legacy-mod-5a \
- tst-cet-legacy-mod-5b tst-cet-legacy-mod-5c \
- tst-cet-legacy-mod-6a tst-cet-legacy-mod-6b \
- tst-cet-legacy-mod-6c
+tests += \
+ tst-cet-legacy-4a \
+ tst-cet-legacy-4b \
+ tst-cet-legacy-4c \
+ tst-cet-legacy-5b \
+ tst-cet-legacy-6b \
+# tests
+modules-names += \
+ tst-cet-legacy-mod-1 \
+ tst-cet-legacy-mod-2 \
+ tst-cet-legacy-mod-4 \
+ tst-cet-legacy-mod-5a \
+ tst-cet-legacy-mod-5b \
+ tst-cet-legacy-mod-5c \
+ tst-cet-legacy-mod-6a \
+ tst-cet-legacy-mod-6b \
+ tst-cet-legacy-mod-6c \
+# modules-names
CFLAGS-tst-cet-legacy-2.c += -fcf-protection=branch
CFLAGS-tst-cet-legacy-2a.c += -fcf-protection
@@ -243,7 +287,9 @@ endif
ifeq ($(subdir),posix)
tests += \
tst-sysconf-cache-linesize \
- tst-sysconf-cache-linesize-static
+ tst-sysconf-cache-linesize-static \
+# tests
tests-static += \
- tst-sysconf-cache-linesize-static
+ tst-sysconf-cache-linesize-static \
+# tests-static
endif
--
2.27.0


@ -0,0 +1,77 @@
From 58822f954f6284c8687dfff43fa4e9e349eeccad Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 28 Aug 2023 12:08:14 -0700
Subject: [PATCH] x86: Check the lower byte of EAX of CPUID leaf 2 [BZ
#30643]
The old Intel software developer manual specified that the low byte of
EAX of CPUID leaf 2 returned 1 which indicated the number of rounds of
CPUID leaf 2 was needed to retrieve the complete cache information. The
newer Intel manual has been changed so that it should always return 1
and be ignored. If the lower byte isn't 1, CPUID leaf 2 can't be used.
In this case, we ignore CPUID leaf 2 and use CPUID leaf 4 instead. If
CPUID leaf 4 doesn't contain the cache information, cache information
isn't available at all. This addresses BZ #30643.
(cherry picked from commit 1493622f4f9048ffede3fbedb64695efa49d662a)
---
sysdeps/x86/dl-cacheinfo.h | 31 +++++++++++++------------------
1 file changed, 13 insertions(+), 18 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 6c7740422a..400d15f208 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -187,7 +187,7 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
++round;
}
/* There is no other cache information anywhere else. */
- break;
+ return -1;
}
else
{
@@ -257,28 +257,23 @@ handle_intel (int name, const struct cpu_features *cpu_features)
/* OK, we can use the CPUID instruction to get all info about the
caches. */
- unsigned int cnt = 0;
- unsigned int max = 1;
long int result = 0;
bool no_level_2_or_3 = false;
bool has_level_2 = false;
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ __cpuid (2, eax, ebx, ecx, edx);
- while (cnt++ < max)
+ /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
+ should be ignored. If it isn't 1, use CPUID leaf 4 instead. */
+ if ((eax & 0xff) != 1)
+ return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
+ cpu_features);
+ else
{
- unsigned int eax;
- unsigned int ebx;
- unsigned int ecx;
- unsigned int edx;
- __cpuid (2, eax, ebx, ecx, edx);
-
- /* The low byte of EAX in the first round contain the number of
- rounds we have to make. At least one, the one we are already
- doing. */
- if (cnt == 1)
- {
- max = eax & 0xff;
- eax &= 0xffffff00;
- }
+ eax &= 0xffffff00;
/* Process the individual registers' value. */
result = intel_check_word (name, eax, &has_level_2,
--
2.27.0
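A small, hypothetical user-space probe of the behaviour the patch above relies on, using GCC's <cpuid.h> (x86 only): it reads leaf 2 and reports whether the low byte of EAX is the always-1 value documented by current manuals.

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  if (!__get_cpuid (2, &eax, &ebx, &ecx, &edx))
    {
      puts ("CPUID leaf 2 not supported");
      return 0;
    }

  /* If the low byte is not 1, the leaf 2 descriptors cannot be
     trusted and cache information must come from leaf 4 instead,
     which is exactly the fallback the patch takes.  */
  printf ("CPUID.2:EAX low byte = %u\n", eax & 0xff);
  return 0;
}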


@ -0,0 +1,254 @@
From 04b8d484323b2ff18b3422c4b883ef4cb6281c53 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 24 May 2024 12:38:50 -0500
Subject: [PATCH] x86: Improve large memset perf with non-temporal stores
[RHEL-29312]
Previously we used `rep stosb` for all medium/large memsets. This is
notably worse than non-temporal stores for large (above a
few MBs) memsets.
See:
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
For data using different strategies for large memset on ICX and SKX.
Using non-temporal stores can be up to 3x faster on ICX and 2x faster
on SKX. Historically, these numbers would not have been so good
because of the zero-over-zero writeback optimization that `rep stosb`
is able to do. But, the zero-over-zero writeback optimization has been
removed as a potential side-channel attack, so there is no longer any
good reason to only rely on `rep stosb` for large memsets. On the flip
size, non-temporal writes can avoid data in their RFO requests saving
memory bandwidth.
All of the other changes to the file are to re-organize the
code-blocks to maintain "good" alignment given the new code added in
the `L(stosb_local)` case.
The results from running the GLIBC memset benchmarks on TGL-client for
N=20 runs:
Geometric Mean across the suite New / Old EXEX256: 0.979
Geometric Mean across the suite New / Old EXEX512: 0.979
Geometric Mean across the suite New / Old AVX2 : 0.986
Geometric Mean across the suite New / Old SSE2 : 0.979
Most of the cases are essentially unchanged, this is mostly to show
that adding the non-temporal case didn't add any regressions to the
other cases.
The results on the memset-large benchmark suite on TGL-client for N=20
runs:
Geometric Mean across the suite New / Old EXEX256: 0.926
Geometric Mean across the suite New / Old EXEX512: 0.925
Geometric Mean across the suite New / Old AVX2 : 0.928
Geometric Mean across the suite New / Old SSE2 : 0.924
So roughly a 7.5% speedup. This is lower than what we see on servers
(likely because clients typically have faster single-core bandwidth so
saving bandwidth on RFOs is less impactful), but still advantageous.
Full test-suite passes on x86_64 w/ and w/o multiarch.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 5bf0ab80573d66e4ae5d94b094659094336da90f)
---
.../multiarch/memset-vec-unaligned-erms.S | 147 +++++++++++-------
1 file changed, 91 insertions(+), 56 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 0f0636b90f..aba45e3da0 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -21,8 +21,13 @@
2. If size is less than VEC, use integer register stores.
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
- 4 VEC stores and store 4 * VEC at a time until done. */
+ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done.
+ 6. On machines ERMS feature, if size is range
+ [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ then REP STOSB will be used.
+ 7. If size >= __x86_shared_non_temporal_threshold, use a
+ non-temporal stores. */
#include <sysdep.h>
@@ -145,6 +150,41 @@ L(entry_from_wmemset):
VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VMM(0), (%rdi)
VZEROUPPER_RETURN
+
+ /* If have AVX512 mask instructions put L(less_vec) close to
+ entry as it doesn't take much space and is likely a hot target. */
+#ifdef USE_LESS_VEC_MASK_STORE
+ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */
+ .p2align 6,, 47
+ .p2align 4
+L(less_vec):
+L(less_vec_from_wmemset):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. Note that we are using rax which is set in
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
+ andl $(PAGE_SIZE - 1), %edi
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
+ serious performance degradation when it has to fault suppress. */
+ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+ /* This is generally considered a cold target. */
+ ja L(cross_page)
+# if VEC_SIZE > 32
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+ kmovq %rcx, %k1
+# else
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k1
+# endif
+ vmovdqu8 %VMM(0), (%rax){%k1}
+ VZEROUPPER_RETURN
+#endif
+
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))
@@ -183,54 +223,6 @@ L(last_2x_vec):
#endif
VZEROUPPER_RETURN
- /* If have AVX512 mask instructions put L(less_vec) close to
- entry as it doesn't take much space and is likely a hot target.
- */
-#ifdef USE_LESS_VEC_MASK_STORE
- .p2align 4,, 10
-L(less_vec):
-L(less_vec_from_wmemset):
- /* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-# endif
- /* Clear high bits from edi. Only keeping bits relevant to page
- cross check. Note that we are using rax which is set in
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
- andl $(PAGE_SIZE - 1), %edi
- /* Check if VEC_SIZE store cross page. Mask stores suffer
- serious performance degradation when it has to fault suppress.
- */
- cmpl $(PAGE_SIZE - VEC_SIZE), %edi
- /* This is generally considered a cold target. */
- ja L(cross_page)
-# if VEC_SIZE > 32
- movq $-1, %rcx
- bzhiq %rdx, %rcx, %rcx
- kmovq %rcx, %k1
-# else
- movl $-1, %ecx
- bzhil %edx, %ecx, %ecx
- kmovd %ecx, %k1
-# endif
- vmovdqu8 %VMM(0), (%rax){%k1}
- VZEROUPPER_RETURN
-
-# if defined USE_MULTIARCH && IS_IN (libc)
- /* Include L(stosb_local) here if including L(less_vec) between
- L(stosb_more_2x_vec) and ENTRY. This is to cache align the
- L(stosb_more_2x_vec) target. */
- .p2align 4,, 10
-L(stosb_local):
- movzbl %sil, %eax
- mov %RDX_LP, %RCX_LP
- mov %RDI_LP, %RDX_LP
- rep stosb
- mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
-# endif
-#endif
-
#if defined USE_MULTIARCH && IS_IN (libc)
.p2align 4
L(stosb_more_2x_vec):
@@ -316,21 +308,33 @@ L(return_vzeroupper):
ret
#endif
- .p2align 4,, 10
-#ifndef USE_LESS_VEC_MASK_STORE
-# if defined USE_MULTIARCH && IS_IN (libc)
+#ifdef USE_WITH_AVX2
+ .p2align 4
+#else
+ .p2align 4,, 4
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP
mov %RDI_LP, %RDX_LP
rep stosb
+# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
+ /* Use xchg to save 1-byte (this helps align targets below). */
+ xchg %RDX_LP, %RAX_LP
+# else
mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
# endif
+ VZEROUPPER_RETURN
+#endif
+#ifndef USE_LESS_VEC_MASK_STORE
/* Define L(less_vec) only if not otherwise defined. */
- .p2align 4
+ .p2align 4,, 12
L(less_vec):
/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
xmm). This is only does anything for AVX2. */
@@ -421,4 +425,35 @@ L(between_2_3):
movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
#endif
ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# ifdef USE_WITH_AVX512
+ /* Force align so the loop doesn't cross a cache-line. */
+ .p2align 4
+# endif
+ .p2align 4,, 7
+ /* Memset using non-temporal stores. */
+L(nt_memset):
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi)
+ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx
+ /* Align DST. */
+ orq $(VEC_SIZE * 1 - 1), %rdi
+ incq %rdi
+ .p2align 4,, 7
+L(nt_loop):
+ VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi)
+ subq $(VEC_SIZE * -4), %rdi
+ cmpq %rdx, %rdi
+ jb L(nt_loop)
+ sfence
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx)
+ VZEROUPPER_RETURN
+#endif
+
+END(MEMSET_SYMBOL(__memset, unaligned_erms))
--
2.27.0
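In C-like terms, the size-based strategy selection the assembly above implements looks roughly like the following sketch (the thresholds correspond to the glibc tunables named in the patch; the helper functions are placeholders, not real glibc entry points):

#include <stddef.h>

/* Placeholder thresholds and helpers; in glibc these are the
   x86_rep_stosb_threshold / x86_shared_non_temporal_threshold values
   and hand-written assembly paths.  */
extern size_t rep_stosb_threshold, shared_non_temporal_threshold;
extern void *vector_store_loop (void *, int, size_t);
extern void *rep_stosb (void *, int, size_t);
extern void *non_temporal_store_loop (void *, int, size_t);

void *
memset_dispatch (void *dst, int c, size_t n)
{
  if (n < rep_stosb_threshold)
    return vector_store_loop (dst, c, n);   /* small/medium sizes */
  if (n < shared_non_temporal_threshold)
    return rep_stosb (dst, c, n);           /* ERMS range */
  /* Very large sizes: non-temporal stores skip the RFO traffic and,
     per the measurements above, can be 2-3x faster than rep stosb.  */
  return non_temporal_store_loop (dst, c, n);
}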


@ -0,0 +1,34 @@
From 5a64f933655384477d85122c6855dc6d84061810 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 1 Nov 2023 15:30:26 -0500
Subject: [PATCH] x86: Only align destination to 1x VEC_SIZE in memset 4x
loop
Current code aligns to 2x VEC_SIZE. Aligning to 2x has no effect on
performance other than potentially resulting in an additional
iteration of the loop.
1x maintains aligned stores (the only reason to align in this case)
and doesn't incur any unnecessary loop iterations.
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit 9469261cf1924d350feeec64d2c80cafbbdcdd4d)
---
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 3d9ad49cb9..0f0636b90f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -293,7 +293,7 @@ L(more_2x_vec):
leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
/* Align dst for loop. */
- andq $(VEC_SIZE * -2), %LOOP_REG
+ andq $(VEC_SIZE * -1), %LOOP_REG
.p2align 4
L(loop):
VMOVA %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
--
2.27.0
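
A tiny, runnable illustration of the difference this one-line change makes (the address and VEC_SIZE below are made-up values; the real code applies the mask to %LOOP_REG):

#include <stdint.h>
#include <stdio.h>

#define VEC_SIZE 32u   /* illustrative; matches the AVX2 build */

int
main (void)
{
  uintptr_t p = 0x1060;   /* some mid-buffer position, already 32-byte aligned */
  uintptr_t a1 = p & ~(uintptr_t) (1 * VEC_SIZE - 1);   /* andq $(VEC_SIZE * -1) */
  uintptr_t a2 = p & ~(uintptr_t) (2 * VEC_SIZE - 1);   /* the old 2x mask */
  printf ("1x: %#lx  2x: %#lx\n", (unsigned long) a1, (unsigned long) a2);
  /* 2x can land a full VEC_SIZE lower, which is the potential extra loop
     iteration the commit message refers to.  */
  return 0;
}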

View File

@ -0,0 +1,149 @@
From 12fec8aae5e17cc4dc3bb079265c46ee78faeddb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 27 Sep 2024 15:50:10 -0700
Subject: [PATCH] x86/string: Fixup alignment of main loop in
str{n}cmp-evex [BZ #32212]
The loop should be aligned to 32 bytes so that it can ideally run out
of the DSB. This is particularly important on Skylake-Server, where
deficiencies in its DSB implementation make it prone to not being
able to run loops out of the DSB.
For example, running strcmp-evex on a 200MB string:
32-byte aligned loop:
- 43,399,578,766 idq.dsb_uops
not 32-byte aligned loop:
- 6,060,139,704 idq.dsb_uops
This results in a 25% performance degradation for the non-aligned
version.
The fix is simply to ensure that the code layout keeps the loop
aligned (this was previously the case but was accidentally dropped
in 84e7c46df).
NB: The fix actually uses 64-byte alignment. This is because 64-byte
alignment generally produces more stable performance than 32-byte
aligned code (cache-line crosses can affect perf), so if we are going
past 16-byte alignment, we might as well go to 64. 64-byte alignment
also matches most other functions we over-align, so it creates a
common point of optimization.
Times are reported as the ratio Time_With_Patch /
Time_Without_Patch. Lower is better.
The values reported are the geometric mean of the ratio across
all tests in bench-strcmp and bench-strncmp.
Note this patch is only attempting to improve the Skylake-Server
strcmp for long strings. The rest of the numbers are only to test for
regressions.
Tigerlake Results Strings <= 512:
strcmp : 1.026
strncmp: 0.949
Tigerlake Results Strings > 512:
strcmp : 0.994
strncmp: 0.998
Skylake-Server Results Strings <= 512:
strcmp : 0.945
strncmp: 0.943
Skylake-Server Results Strings > 512:
strcmp : 0.778
strncmp: 1.000
The 2.6% regression on TGL-strcmp is due to slowdowns caused by
changes in the alignment of code handling small sizes (mostly in the
page-cross logic). These should be safe to ignore because 1) we
previously only 16-byte aligned the function, so this behavior is not
new and was essentially up to chance before this patch, and 2) this
type of alignment-related regression on small sizes really only comes
up in tight micro-benchmark loops and is unlikely to have any effect
on real-world performance.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 483443d3211532903d7e790211af5a1d55fdb1f3)
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index ae39cdf217..6a7fec669e 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -209,7 +209,9 @@
returned. */
.section SECTION(.text), "ax", @progbits
- .align 16
+ /* Align 64 bytes here. This is to get the L(loop) block ideally
+ aligned for the DSB. */
+ .align 64
.type STRCMP, @function
.globl STRCMP
# ifdef USE_AS_STRCASECMP_L
@@ -509,9 +511,7 @@ L(ret4):
ret
# endif
- /* 32 byte align here ensures the main loop is ideally aligned
- for DSB. */
- .p2align 5
+ .p2align 4,, 4
L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU (VEC_SIZE)(%rdi), %VMM(0)
@@ -1426,10 +1426,9 @@ L(less_32_till_page):
L(ret_zero_page_cross_slow_case0):
xorl %eax, %eax
ret
-# endif
-
-
+# else
.p2align 4,, 10
+# endif
L(less_16_till_page):
cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
ja L(less_8_till_page)
@@ -1482,8 +1481,12 @@ L(less_16_till_page):
# endif
jmp L(prepare_loop_aligned)
-
-
+# ifndef USE_AS_STRNCMP
+ /* Fits in aligning bytes. */
+L(ret_zero_4_loop):
+ xorl %eax, %eax
+ ret
+# endif
.p2align 4,, 10
L(less_8_till_page):
@@ -1554,6 +1557,7 @@ L(ret_less_8_wcs):
# ifdef USE_AS_STRNCMP
.p2align 4,, 2
+L(ret_zero_4_loop):
L(ret_zero_page_cross_slow_case1):
xorl %eax, %eax
ret
@@ -1586,10 +1590,6 @@ L(less_4_loop):
subq $-(CHAR_PER_VEC * 4), %rdx
# endif
jmp L(prepare_loop_aligned)
-
-L(ret_zero_4_loop):
- xorl %eax, %eax
- ret
L(ret_less_4_loop):
xorl %r8d, %eax
subl %r8d, %eax
--
2.27.0
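
For reference, the reported values are the geometric mean of the per-test time ratios; a minimal sketch of that computation (the ratios below are invented, purely to exercise the helper):

#include <math.h>
#include <stdio.h>

/* Geometric mean of per-test time ratios, as used for the numbers above.  */
static double
geomean (const double *r, int n)
{
  double log_sum = 0.0;
  for (int i = 0; i < n; i++)
    log_sum += log (r[i]);
  return exp (log_sum / n);
}

int
main (void)
{
  double ratios[] = { 0.95, 1.02, 0.88, 1.00 };
  printf ("geometric mean: %f\n", geomean (ratios, 4));
  return 0;
}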

View File

@ -0,0 +1,135 @@
From b2a45f1eee39d67c1fff2d697d32857fb13c8c5d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 11 Aug 2023 08:04:08 -0700
Subject: [PATCH] x86_64: Add expm1 with FMA
On Skylake, it improves expm1 bench performance by:
Before After Improvement
max 70.204 68.054 3%
min 20.709 16.2 22%
mean 22.1221 16.7367 24%
NB: Add
extern long double __expm1l (long double);
extern long double __expm1f128 (long double);
for __typeof (__expm1l) and __typeof (__expm1f128) when __expm1 is
defined, since __expm1 may be expanded in their declarations, which
would otherwise cause a build failure.
(cherry picked from commit 1b214630ce6f7e0099b8b6f87246246739b079cf)
---
sysdeps/ieee754/dbl-64/s_expm1.c | 7 +++++
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c | 10 ++++++
sysdeps/x86_64/fpu/multiarch/s_expm1.c | 36 ++++++++++++++++++++++
4 files changed, 55 insertions(+)
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1.c
diff --git a/sysdeps/ieee754/dbl-64/s_expm1.c b/sysdeps/ieee754/dbl-64/s_expm1.c
index 8f1c95bd04..1cafeca9c0 100644
--- a/sysdeps/ieee754/dbl-64/s_expm1.c
+++ b/sysdeps/ieee754/dbl-64/s_expm1.c
@@ -130,6 +130,11 @@ static const double
4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */
-2.01099218183624371326e-07 }; /* BE8AFDB7 6E09C32D */
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__expm1 (double x)
{
@@ -258,4 +263,6 @@ __expm1 (double x)
}
return y;
}
+#ifndef __expm1
libm_alias_double (__expm1, expm1)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index f773255721..add339a876 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -37,6 +37,7 @@ libm-sysdep_routines += \
e_log2-fma \
e_pow-fma \
s_atan-fma \
+ s_expm1-fma \
s_sin-fma \
s_sincos-fma \
s_tan-fma \
@@ -49,6 +50,7 @@ CFLAGS-e_log-fma.c = -mfma -mavx2
CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
+CFLAGS-s_expm1-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
new file mode 100644
index 0000000000..3ee2bd804e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
@@ -0,0 +1,10 @@
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_expm1.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1.c b/sysdeps/x86_64/fpu/multiarch/s_expm1.c
new file mode 100644
index 0000000000..2cae83fb7f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_expm1.c
@@ -0,0 +1,36 @@
+/* Multiple versions of expm1.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+
+extern double __redirect_expm1 (double);
+
+#define SYMBOL_NAME expm1
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_expm1, __expm1, IFUNC_SELECTOR ());
+libm_alias_double (__expm1, expm1)
+
+#define __expm1 __expm1_sse2
+
+/* NB: __expm1 may be expanded to __expm1_sse2 in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#include <sysdeps/ieee754/dbl-64/s_expm1.c>
--
2.27.0
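
glibc's ifunc-fma.h / libc_ifunc_redirected machinery boils down to a load-time choice between the FMA build and the SSE2 build of the same C source. A self-contained sketch of that idea using GCC's plain ifunc attribute on an x86-64 ELF target (foo and resolve_foo are illustrative names, not glibc internals):

#include <stdio.h>

static double foo_generic (double x) { return x * 2.0; }
static double foo_fma (double x)     { return x * 2.0; }  /* stand-in for the -mfma build */

/* The resolver runs once, at relocation time, and picks the implementation.  */
static double (*resolve_foo (void)) (double)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("fma") ? foo_fma : foo_generic;
}

double foo (double) __attribute__ ((ifunc ("resolve_foo")));

int
main (void)
{
  printf ("foo (21.0) = %g\n", foo (21.0));
  return 0;
}

In the actual patch the FMA variant is additionally compiled with -mfma -mavx2 and placed in the .text.fma section via the SECTION macro shown in the diff.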

View File

@ -0,0 +1,140 @@
From c92946d9b29956be78ca4487264848714fd5d505 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 17 Aug 2023 09:42:29 -0700
Subject: [PATCH] x86_64: Add log1p with FMA
On Skylake, it changes log1p bench performance by:
Before After Improvement
max 63.349 58.347 8%
min 4.448 5.651 -30%
mean 12.0674 10.336 14%
The minimum code path is
if (hx < 0x3FDA827A) /* x < 0.41422 */
{
if (__glibc_unlikely (ax >= 0x3ff00000)) /* x <= -1.0 */
{
...
}
if (__glibc_unlikely (ax < 0x3e200000)) /* |x| < 2**-29 */
{
math_force_eval (two54 + x); /* raise inexact */
if (ax < 0x3c900000) /* |x| < 2**-54 */
{
...
}
else
return x - x * x * 0.5;
The FMA and non-FMA code sequences look similar, and the non-FMA
version is slightly faster. Since log1p is called by asinh and atanh,
this change improves asinh performance by:
Before After Improvement
max 75.645 63.135 16%
min 10.074 10.071 0%
mean 15.9483 14.9089 6%
and improves atanh performance by:
Before After Improvement
max 91.768 75.081 18%
min 15.548 13.883 10%
mean 18.3713 16.8011 8%
(cherry picked from commit a8ecb126d4c26c52f4ad828c566afe4043a28155)
---
sysdeps/ieee754/dbl-64/s_log1p.c | 5 ++++
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c | 4 +++
sysdeps/x86_64/fpu/multiarch/s_log1p.c | 29 ++++++++++++++++++++++
4 files changed, 40 insertions(+)
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p.c
diff --git a/sysdeps/ieee754/dbl-64/s_log1p.c b/sysdeps/ieee754/dbl-64/s_log1p.c
index e6476a8260..eeb0af859f 100644
--- a/sysdeps/ieee754/dbl-64/s_log1p.c
+++ b/sysdeps/ieee754/dbl-64/s_log1p.c
@@ -99,6 +99,11 @@ static const double
static const double zero = 0.0;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__log1p (double x)
{
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index add339a876..ea81753b70 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -38,6 +38,7 @@ libm-sysdep_routines += \
e_pow-fma \
s_atan-fma \
s_expm1-fma \
+ s_log1p-fma \
s_sin-fma \
s_sincos-fma \
s_tan-fma \
@@ -51,6 +52,7 @@ CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_expm1-fma.c = -mfma -mavx2
+CFLAGS-s_log1p-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
new file mode 100644
index 0000000000..8952df8f9e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
@@ -0,0 +1,4 @@
+#define __log1p __log1p_fma
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_log1p.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p.c b/sysdeps/x86_64/fpu/multiarch/s_log1p.c
new file mode 100644
index 0000000000..6ce5198d6d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_log1p.c
@@ -0,0 +1,29 @@
+/* Multiple versions of log1p.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+
+extern double __redirect_log1p (double);
+
+#define SYMBOL_NAME log1p
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_log1p, __log1p, IFUNC_SELECTOR ());
+
+#define __log1p __log1p_sse2
+#include <sysdeps/ieee754/dbl-64/s_log1p.c>
--
2.27.0
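
The |x| < 2**-29 branch quoted above returns x - x*x*0.5, the second-order expansion of log1p around zero. A small standalone check of how close that shortcut is (illustrative only, not glibc test code; link with -lm):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = 0x1p-30;                 /* well inside the |x| < 2**-29 branch */
  double shortcut = x - x * x * 0.5;  /* the return quoted above */
  printf ("log1p(x)  = %a\n", log1p (x));
  printf ("shortcut  = %a\n", shortcut);
  printf ("abs diff  = %g\n", fabs (log1p (x) - shortcut));
  return 0;
}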

View File

@ -0,0 +1,102 @@
From 49016f2190693d5b2d4d6294d438ebae7a58d151 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 10 Aug 2023 11:24:30 -0700
Subject: [PATCH] x86_64: Add log2 with FMA
On Skylake, it improves log2 bench performance by:
Before After Improvement
max 208.779 63.827 69%
min 9.977 6.55 34%
mean 10.366 6.8191 34%
(cherry picked from commit f6b10ed8e9a00de49d0951e760cc2b5288862b47)
---
sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++
sysdeps/x86_64/fpu/multiarch/e_log2-fma.c | 3 ++
sysdeps/x86_64/fpu/multiarch/e_log2.c | 43 +++++++++++++++++++++++
3 files changed, 48 insertions(+)
create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2.c
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index e37e488c37..f773255721 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -34,6 +34,7 @@ libm-sysdep_routines += \
e_atan2-fma \
e_exp-fma \
e_log-fma \
+ e_log2-fma \
e_pow-fma \
s_atan-fma \
s_sin-fma \
@@ -45,6 +46,7 @@ CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
CFLAGS-e_exp-fma.c = -mfma -mavx2
CFLAGS-e_log-fma.c = -mfma -mavx2
+CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
new file mode 100644
index 0000000000..9fbebc1b47
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
@@ -0,0 +1,3 @@
+#define __log2 __log2_fma
+
+#include <sysdeps/ieee754/dbl-64/e_log2.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2.c b/sysdeps/x86_64/fpu/multiarch/e_log2.c
new file mode 100644
index 0000000000..c0320caf36
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_log2.c
@@ -0,0 +1,43 @@
+/* Multiple versions of log2.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+#include <libm-alias-finite.h>
+
+extern double __redirect_log2 (double);
+
+#define SYMBOL_NAME log2
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_log2, __log2, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (__log2, __GI___log2, __redirect_log2)
+ __attribute__ ((visibility ("hidden")));
+
+versioned_symbol (libm, __ieee754_log2, log2, GLIBC_2_29);
+libm_alias_double_other (__log2, log2)
+#else
+libm_alias_double (__log2, log2)
+#endif
+
+strong_alias (__log2, __ieee754_log2)
+libm_alias_finite (__log2, __log2)
+
+#define __log2 __log2_sse2
+#include <sysdeps/ieee754/dbl-64/e_log2.c>
--
2.27.0

View File

@ -0,0 +1,44 @@
From dc1762113dbe40be832bedd41b52d9822d62c50f Mon Sep 17 00:00:00 2001
From: Gabi Falk <gabifalk@gmx.com>
Date: Tue, 7 May 2024 18:25:00 +0000
Subject: [PATCH] x86_64: Fix missing wcsncat function definition without
multiarch (x86-64-v4)
This code expects the WCSCAT preprocessor macro to be predefined when
the evex implementation of the function should be defined with a name
different from __wcsncat_evex. However, when glibc is built for
x86-64-v4 without multiarch support, sysdeps/x86_64/wcsncat.S defines
the WCSNCAT variable instead of WCSCAT to build it as wcsncat. Rename
the variable to WCSNCAT, as it is actually a better naming choice for
the variable in this case.
Reported-by: Kenton Groombridge
Link: https://bugs.gentoo.org/921945
Fixes: 64b8b6516b ("x86: Add evex optimized functions for the wchar_t strcpy family")
Signed-off-by: Gabi Falk <gabifalk@gmx.com>
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit dd5f891c1ad9f1b43b9db93afe2a55cbb7a6194e)
---
sysdeps/x86_64/multiarch/wcsncat-evex.S | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S
index 392215950a..10bfb0a531 100644
--- a/sysdeps/x86_64/multiarch/wcsncat-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S
@@ -1,9 +1,9 @@
-#ifndef WCSCAT
-# define WCSCAT __wcsncat_evex
+#ifndef WCSNCAT
+# define WCSNCAT __wcsncat_evex
#endif
#define USE_AS_WCSCPY
#define USE_AS_STRCAT
-#define STRNCAT WCSCAT
+#define STRNCAT WCSNCAT
#include "strncat-evex.S"
--
2.27.0
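
The fix relies on the usual override pattern: the included file only supplies a default symbol name when the builder has not already chosen one. A small, illustrative preprocessor demonstration (my_wcsncat is a made-up name standing in for the non-multiarch build's choice):

#include <stdio.h>

#define WCSNCAT my_wcsncat          /* what the including .S file would set */

#ifndef WCSNCAT                     /* default only when nothing was chosen */
# define WCSNCAT __wcsncat_evex
#endif

#define STR(x) #x
#define XSTR(x) STR (x)

int
main (void)
{
  printf ("function will be built as: %s\n", XSTR (WCSNCAT));
  return 0;
}

Before the fix, the default was guarded by WCSCAT instead, so the name chosen by the non-multiarch build was silently ignored and no wcsncat symbol was emitted.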

View File

@ -0,0 +1,144 @@
From 5c9be512ee25ceab92a284adc75fe22bbd94b179 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 9 Aug 2023 11:08:52 -0700
Subject: [PATCH] x86_64: Sort fpu/multiarch/Makefile
Sort Makefile variables using scripts/sort-makefile-lines.py.
No code generation changes observed in libm. No regressions on x86_64.
(cherry picked from commit 881546979d0219c18337e1b4f4d00cfacab13c40)
---
sysdeps/x86_64/fpu/multiarch/Makefile | 94 +++++++++++++++++++++------
1 file changed, 74 insertions(+), 20 deletions(-)
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 248162525b..e37e488c37 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,17 +1,45 @@
ifeq ($(subdir),math)
-libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
- s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
- s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
+libm-sysdep_routines += \
+ s_ceil-c \
+ s_ceilf-c \
+ s_floor-c \
+ s_floorf-c \
+ s_rint-c \
+ s_rintf-c \
+ s_nearbyint-c \
+ s_nearbyintf-c \
+ s_roundeven-c \
+ s_roundevenf-c \
+ s_trunc-c \
+ s_truncf-c \
+# libm-sysdep_routines
-libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
- s_floorf-sse4_1 s_nearbyint-sse4_1 \
- s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
- s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
- s_trunc-sse4_1 s_truncf-sse4_1
+libm-sysdep_routines += \
+ s_ceil-sse4_1 \
+ s_ceilf-sse4_1 \
+ s_floor-sse4_1 \
+ s_floorf-sse4_1 \
+ s_nearbyint-sse4_1 \
+ s_nearbyintf-sse4_1 \
+ s_roundeven-sse4_1 \
+ s_roundevenf-sse4_1 \
+ s_rint-sse4_1 \
+ s_rintf-sse4_1 \
+ s_trunc-sse4_1 \
+ s_truncf-sse4_1 \
+# libm-sysdep_routines
-libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
- e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
- s_sincos-fma
+libm-sysdep_routines += \
+ e_asin-fma \
+ e_atan2-fma \
+ e_exp-fma \
+ e_log-fma \
+ e_pow-fma \
+ s_atan-fma \
+ s_sin-fma \
+ s_sincos-fma \
+ s_tan-fma \
+# libm-sysdep_routines
CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
@@ -23,10 +51,22 @@ CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
-libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
+libm-sysdep_routines += \
+ s_cosf-sse2 \
+ s_sincosf-sse2 \
+ s_sinf-sse2 \
+# libm-sysdep_routines
-libm-sysdep_routines += e_exp2f-fma e_expf-fma e_log2f-fma e_logf-fma \
- e_powf-fma s_sinf-fma s_cosf-fma s_sincosf-fma
+libm-sysdep_routines += \
+ e_exp2f-fma \
+ e_expf-fma \
+ e_log2f-fma \
+ e_logf-fma \
+ e_powf-fma \
+ s_cosf-fma \
+ s_sincosf-fma \
+ s_sinf-fma \
+# libm-sysdep_routines
CFLAGS-e_exp2f-fma.c = -mfma -mavx2
CFLAGS-e_expf-fma.c = -mfma -mavx2
@@ -37,9 +77,17 @@ CFLAGS-s_sinf-fma.c = -mfma -mavx2
CFLAGS-s_cosf-fma.c = -mfma -mavx2
CFLAGS-s_sincosf-fma.c = -mfma -mavx2
-libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
- e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
- s_sincos-fma4
+libm-sysdep_routines += \
+ e_exp-fma4 \
+ e_log-fma4 \
+ e_pow-fma4 \
+ e_asin-fma4 \
+ s_atan-fma4 \
+ e_atan2-fma4 \
+ s_sin-fma4 \
+ s_sincos-fma4 \
+ s_tan-fma4 \
+# libm-sysdep_routines
CFLAGS-e_asin-fma4.c = -mfma4
CFLAGS-e_atan2-fma4.c = -mfma4
@@ -51,9 +99,15 @@ CFLAGS-s_sin-fma4.c = -mfma4
CFLAGS-s_tan-fma4.c = -mfma4
CFLAGS-s_sincos-fma4.c = -mfma4
-libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
- e_atan2-avx s_sin-avx s_tan-avx \
- s_sincos-avx
+libm-sysdep_routines += \
+ e_exp-avx \
+ e_log-avx \
+ s_atan-avx \
+ e_atan2-avx \
+ s_sin-avx \
+ s_sincos-avx \
+ s_tan-avx \
+# libm-sysdep_routines
CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
--
2.27.0