fpu: x86-64 optimize load of all bits set into ZMM register.
This commit is contained in:
parent
9564aeebe9
commit
018de895ca
@ -63,7 +63,7 @@
|
||||
##############################################################################
|
||||
Name: glibc
|
||||
Version: 2.34
|
||||
Release: 12
|
||||
Release: 13
|
||||
Summary: The GNU libc libraries
|
||||
License: %{all_license}
|
||||
URL: http://www.gnu.org/software/glibc/
|
||||
@ -111,6 +111,7 @@ Patch26: riscv-Drop-reliance-on-_GLOBAL_OFFSET_TABLE_-0.patch
|
||||
Patch27: x86_64-Simplify-elf_machine_-load_address-dynamic.patch
|
||||
Patch28: x86-fix-Autoconf-caching-of-instruction-support-chec.patch
|
||||
Patch29: Update-string-test-memmove.c-to-cover-16KB-copy.patch
|
||||
Patch30: x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch
|
||||
|
||||
#Patch9000: turn-REP_STOSB_THRESHOLD-from-2k-to-1M.patch
|
||||
Patch9001: delete-no-hard-link-to-avoid-all_language-package-to.patch
|
||||
@ -1271,6 +1272,9 @@ fi
|
||||
%doc hesiod/README.hesiod
|
||||
|
||||
%changelog
|
||||
* Mon Oct 25 2021 Qingqing Li <liqingqing3@huawei.com> - 2.34-13
|
||||
- fpu: x86-64 optimize load of all bits set into ZMM register.
|
||||
|
||||
* Tue Oct 19 2021 Yang Yanchao <yangyanchao6@huawei.com> - 2.34-12
|
||||
- Add locale-archive sub packages to support more languages
|
||||
and reduce memory usage.
|
||||
|
||||
267
x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch
Normal file
267
x86-64-Optimize-load-of-all-bits-set-into-ZMM-regist.patch
Normal file
@ -0,0 +1,267 @@
|
||||
From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 20 Aug 2021 06:42:24 -0700
|
||||
Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ
|
||||
#28252]
|
||||
|
||||
Optimize loads of all bits set into ZMM register in AVX512 SVML codes
|
||||
by replacing
|
||||
|
||||
vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
|
||||
|
||||
and
|
||||
|
||||
vmovups .L_2il0floatpacket.13(%rip), %zmmX
|
||||
|
||||
with
|
||||
vpternlogd $0xff, %zmmX, %zmmX, %zmmX
|
||||
|
||||
This fixes BZ #28252.
|
||||
---
|
||||
sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
|
||||
sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
|
||||
10 files changed, 11 insertions(+), 64 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
index e68fcdb..58e588a 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
|
||||
vmovaps %zmm0, %zmm8
|
||||
|
||||
/* Check for large arguments path */
|
||||
- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
|
||||
+ vpternlogd $0xff, %zmm2, %zmm2, %zmm2
|
||||
|
||||
/*
|
||||
ARGUMENT RANGE REDUCTION:
|
||||
@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_cos_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.16:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.16,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
index dfa2aca..f5f117d 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
|
||||
|
||||
/* preserve mantissa, set input exponent to 2^(-10) */
|
||||
vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
|
||||
- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
|
||||
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm1
|
||||
vpsrlq $32, %zmm4, %zmm6
|
||||
|
||||
/* reciprocal approximation good to at least 11 bits */
|
||||
@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_log_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.12:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.12,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
index be8ab7c..48d251d 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
|
||||
andq $-64, %rsp
|
||||
subq $1280, %rsp
|
||||
movq __svml_d_trig_data@GOTPCREL(%rip), %rax
|
||||
- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm14
|
||||
vmovups __dAbsMask(%rax), %zmm7
|
||||
vmovups __dInvPI(%rax), %zmm2
|
||||
vmovups __dRShifter(%rax), %zmm1
|
||||
@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_sin_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.14:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.14,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
index 6118870..a4944a4 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
|
||||
|
||||
/* SinPoly = SinR*SinPoly */
|
||||
vfmadd213pd %zmm5, %zmm5, %zmm4
|
||||
- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
|
||||
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
|
||||
|
||||
/* Update Cos result's sign */
|
||||
vxorpd %zmm2, %zmm1, %zmm1
|
||||
@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
|
||||
ENTRY (_ZGVeN8vvv_sincos_skx)
|
||||
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
|
||||
END (_ZGVeN8vvv_sincos_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.15:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.15,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
index f671d60..fe8474f 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
|
||||
X = X - Y*PI1 - Y*PI2 - Y*PI3
|
||||
*/
|
||||
vmovaps %zmm0, %zmm6
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm12
|
||||
+ vpternlogd $0xff, %zmm12, %zmm12, %zmm12
|
||||
vmovups __sRShifter(%rax), %zmm3
|
||||
vmovups __sPI1_FMA(%rax), %zmm5
|
||||
vmovups __sA9_FMA(%rax), %zmm9
|
||||
@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16v_cosf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
index 637bfe3..229b782 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
|
||||
vmovaps %zmm0, %zmm7
|
||||
|
||||
/* compare against threshold */
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm3
|
||||
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
|
||||
vmovups __sInvLn2(%rax), %zmm4
|
||||
vmovups __sShifter(%rax), %zmm1
|
||||
vmovups __sLn2hi(%rax), %zmm6
|
||||
@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
|
||||
|
||||
#endif
|
||||
END (_ZGVeN16v_expf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
index 9d790fb..fa2aae9 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
|
||||
andq $-64, %rsp
|
||||
subq $1280, %rsp
|
||||
movq __svml_slog_data@GOTPCREL(%rip), %rax
|
||||
- vmovups .L_2il0floatpacket.7(%rip), %zmm6
|
||||
+ vpternlogd $0xff, %zmm6, %zmm6, %zmm6
|
||||
vmovups _iBrkValue(%rax), %zmm4
|
||||
vmovups _sPoly_7(%rax), %zmm8
|
||||
|
||||
@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
|
||||
|
||||
#endif
|
||||
END (_ZGVeN16v_logf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.7:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.7,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
index c5c43c4..6aea2a4 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
vpsrlq $32, %zmm3, %zmm2
|
||||
vpmovqd %zmm2, %ymm11
|
||||
vcvtps2pd %ymm14, %zmm13
|
||||
- vmovups .L_2il0floatpacket.23(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
vmovaps %zmm14, %zmm26
|
||||
vpandd _ABSMASK(%rax), %zmm1, %zmm8
|
||||
vpcmpd $1, _INF(%rax), %zmm8, %k2
|
||||
@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
vpmovqd %zmm11, %ymm5
|
||||
vpxord %zmm10, %zmm10, %zmm10
|
||||
vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
|
||||
- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
|
||||
+ vpternlogd $0xff, %zmm4, %zmm4, %zmm4
|
||||
vpxord %zmm11, %zmm11, %zmm11
|
||||
vcvtdq2pd %ymm7, %zmm7
|
||||
vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
|
||||
@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16vv_powf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.23:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.23,@object
|
||||
-.L_2il0floatpacket.24:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.24,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
index 9cf359c..a446c50 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
|
||||
|
||||
/* Result sign calculations */
|
||||
vpternlogd $150, %zmm0, %zmm14, %zmm1
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
|
||||
/* Add correction term 0.5 for cos() part */
|
||||
vaddps %zmm8, %zmm5, %zmm15
|
||||
@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
|
||||
ENTRY (_ZGVeN16vvv_sincosf_skx)
|
||||
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
|
||||
END (_ZGVeN16vvv_sincosf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
index bd05109..c1b352d 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
|
||||
movq __svml_s_trig_data@GOTPCREL(%rip), %rax
|
||||
|
||||
/* Check for large and special values */
|
||||
- vmovups .L_2il0floatpacket.11(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
vmovups __sAbsMask(%rax), %zmm5
|
||||
vmovups __sInvPI(%rax), %zmm1
|
||||
vmovups __sRShifter(%rax), %zmm2
|
||||
@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16v_sinf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.11:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.11,@object
|
||||
--
|
||||
1.8.3.1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user