From 0dfa5db2106d75db595e83f064352fb89d92986e Mon Sep 17 00:00:00 2001
From: wangbin224
Date: Sat, 28 Mar 2020 19:14:41 +0800
Subject: [PATCH] glibc: backport Kunpeng patches

backport Kunpeng patches

Signed-off-by: wangbin224
---
 manual/tunables.texi                        |   2 +-
 sysdeps/aarch64/memcmp.S                    |   4 +-
 sysdeps/aarch64/memrchr.S                   |  15 +-
 sysdeps/aarch64/multiarch/Makefile          |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |  54 +-
 sysdeps/aarch64/multiarch/memcpy.c          |   9 +-
 sysdeps/aarch64/multiarch/memcpy_kunpeng.S  | 576 ------------------
 sysdeps/aarch64/multiarch/memmove.c         |  11 +-
 sysdeps/aarch64/multiarch/memset.c          |  14 +-
 sysdeps/aarch64/multiarch/memset_kunpeng.S  |  58 +-
 sysdeps/aarch64/strcpy.S                    |   6 +-
 sysdeps/aarch64/strnlen.S                   |   4 +-
 .../unix/sysv/linux/aarch64/cpu-features.c  |   4 +-
 .../unix/sysv/linux/aarch64/cpu-features.h  |   7 +-
 14 files changed, 86 insertions(+), 680 deletions(-)
 delete mode 100755 sysdeps/aarch64/multiarch/memcpy_kunpeng.S

diff --git a/manual/tunables.texi b/manual/tunables.texi
index bb4819bd..124b39b6 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -333,7 +333,7 @@ This tunable is specific to powerpc, powerpc64 and powerpc64le.
 The @code{glibc.tune.cpu=xxx} tunable allows the user to tell @theglibc{} to
 assume that the CPU is @code{xxx} where xxx may have one of these values:
 @code{generic}, @code{falkor}, @code{thunderxt88}, @code{thunderx2t99},
-@code{thunderx2t99p1}.
+@code{thunderx2t99p1}, @code{kunpeng}.

 This tunable is specific to aarch64.
 @end deftp
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 04129d83..a2138616 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -1,6 +1,6 @@
 /* memcmp - compare memory

-   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+   Copyright (C) 2013-2018 Free Software Foundation, Inc.

    This file is part of the GNU C Library.

@@ -16,7 +16,7 @@
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library.  If not, see
-   <http://www.gnu.org/licenses/>.  */
+   <http://www.gnu.org/licenses/>.  */

 #include <sysdep.h>

diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index 9095304b..0565168a 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -16,8 +16,8 @@
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
+   <http://www.gnu.org/licenses/>.  */
+
 #include <sysdep.h>

 /* Assumptions:
@@ -61,7 +61,7 @@
  * things occur in the original string, counting trailing zeros allows to
  * identify exactly which byte has matched.
  */
-
+
 ENTRY (__memrchr)
	/* Do not dereference srcin if no bytes to compare.  */
 	cbz	cntin, L(zero_length)
@@ -101,7 +101,7 @@ ENTRY (__memrchr)
 	addp	vend.16b, vend.16b, vend.16b	/* 128->64 */
 	mov	synd, vend.2d[0]
 	/* Clear the (32-soff)*2 upper bits */
-	lsl	tmp, soff, #1
+	lsl	tmp, soff, #1
 	lsl	synd, synd, tmp
 	lsr	synd, synd, tmp
 	/* The first block can also be the last */
@@ -135,16 +135,16 @@ L(end):
 	b.hi	L(tail)

 L(masklast):
-	/* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */
+	/* Clear the (32 - ((cntrem + (32-soff)) % 32)) * 2 lower bits */
 	add	tmp, cntrem, soff
 	and	tmp, tmp, #31
 	sub	tmp, tmp, #32
-	neg	tmp, tmp, lsl #1
+	neg	tmp, tmp, lsl #1
 	lsr	synd, synd, tmp
 	lsl	synd, synd, tmp

 L(tail):
-	/* Compensate the last post-increment*/
+	/* Compensate the last post-increment */
 	add	seek_dst, seek_dst, #32
 	/* Check that we have found a character */
 	cmp	synd, #0
@@ -163,4 +163,3 @@ L(zero_length):
 END (__memrchr)
 weak_alias (__memrchr, memrchr)
 libc_hidden_builtin_def (memrchr)
-
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 90529d40..722ed824 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_kunpeng memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor memset_generic memset_falkor memset_kunpeng
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index bef9b06d..0026dbba 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -1,5 +1,5 @@
 /* Enumerate available IFUNC implementations of a function.  AARCH64 version.
-   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

    The GNU C Library is free software; you can redistribute it and/or
@@ -25,36 +25,34 @@
 #include <stdio.h>

 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	5
+#define MAX_IFUNC	4

 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			size_t max)
 {
-  assert(max >= MAX_IFUNC);
-
-  size_t i = 0;
-
-  INIT_ARCH();
-
-  /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
-  IFUNC_IMPL(i, name, memcpy,
-	     IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_thunderx)
-	     IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_thunderx2)
-	     IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_falkor)
-	     IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_kunpeng)
-	     IFUNC_IMPL_ADD(array, i, memcpy, 1, __memcpy_generic))
-  IFUNC_IMPL(i, name, memmove,
-	     IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_thunderx)
-	     IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_falkor)
-	     IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_kunpeng)
-	     IFUNC_IMPL_ADD(array, i, memmove, 1, __memmove_generic))
-  IFUNC_IMPL(i, name, memset,
-	     /* Enable this on non-falkor processors too so that other cores
-		can do a comparative analysis with __memset_generic.  */
-	     IFUNC_IMPL_ADD(array, i, memset, (zva_size == 64), __memset_falkor)
-	     IFUNC_IMPL_ADD(array, i, memset, 1, __memset_generic)
-	     IFUNC_IMPL_ADD(array, i, memset, 1, __memset_kunpeng))
-
-  return i;
+  assert (max >= MAX_IFUNC);
+
+  size_t i = 0;
+
+  INIT_ARCH ();
+
+  /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+  IFUNC_IMPL (i, name, memset,
+	      /* Enable this on non-falkor processors too so that other cores
+		 can do a comparative analysis with __memset_generic.  */
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+
+  return i;
 }
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 150e1ca9..2d358a83 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -1,5 +1,5 @@
 /* Multiple versions of memcpy.  AARCH64 version.
-   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

    The GNU C Library is free software; you can redistribute it and/or
@@ -32,14 +32,11 @@ extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_kunpeng attribute_hidden;

 libc_ifunc (__libc_memcpy,
-	    IS_KUNPENG920(midr)
-	    ?__memcpy_kunpeng
-	    : (IS_THUNDERX (midr)
+	    (IS_THUNDERX (midr)
 	     ? __memcpy_thunderx
-	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
+	     : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_KUNPENG920 (midr)
 		? __memcpy_falkor
 		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
 		   ? __memcpy_thunderx2
diff --git a/sysdeps/aarch64/multiarch/memcpy_kunpeng.S b/sysdeps/aarch64/multiarch/memcpy_kunpeng.S
deleted file mode 100755
index 2102478a..00000000
--- a/sysdeps/aarch64/multiarch/memcpy_kunpeng.S
+++ /dev/null
@@ -1,576 +0,0 @@
-/* A Kunpeng Optimized memcpy implementation for AARCH64.
-   Copyright (C) 2018-2019 Free Software Foundation, Inc.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses.
- *
- */
-
-#define dstin	x0
-#define src	x1
-#define count	x2
-#define dst	x3
-#define srcend	x4
-#define dstend	x5
-#define tmp2	x6
-#define tmp3	x7
-#define tmp3w	w7
-#define A_l	x6
-#define A_lw	w6
-#define A_h	x7
-#define A_hw	w7
-#define B_l	x8
-#define B_lw	w8
-#define B_h	x9
-#define C_l	x10
-#define C_h	x11
-#define D_l	x12
-#define D_h	x13
-#define E_l	src
-#define E_h	count
-#define F_l	srcend
-#define F_h	dst
-#define G_l	count
-#define G_h	dst
-#define tmp1	x14
-
-#define A_q	q0
-#define B_q	q1
-#define C_q	q2
-#define D_q	q3
-#define E_q	q4
-#define F_q	q5
-#define G_q	q6
-#define H_q	q7
-#define I_q	q16
-#define J_q	q17
-
-#define A_v	v0
-#define B_v	v1
-#define C_v	v2
-#define D_v	v3
-#define E_v	v4
-#define F_v	v5
-#define G_v	v6
-#define H_v	v7
-#define I_v	v16
-#define J_v	v17
-
-#ifndef MEMMOVE
-# define MEMMOVE memmove
-#endif
-#ifndef MEMCPY
-# define MEMCPY memcpy
-#endif
-
-#if IS_IN (libc)
-
-#undef MEMCPY
-#define MEMCPY __memcpy_kunpeng
-#undef MEMMOVE
-#define MEMMOVE __memmove_kunpeng
-
-
-/* Overlapping large forward memmoves use a loop that copies backwards.
-   Otherwise memcpy is used.  Small moves branch to memcopy16 directly.
-   The longer memcpy cases fall through to the memcpy head.
-*/
-
-ENTRY_ALIGN (MEMMOVE, 6)
-
-	DELOUSE (0)
-	DELOUSE (1)
-	DELOUSE (2)
-
-	sub	tmp1, dstin, src
-	cmp	count, 512
-	ccmp	tmp1, count, 2, hi
-	b.lo	L(move_long)
-	cmp	count, 96
-	ccmp	tmp1, count, 2, hi
-	b.lo	L(move_middle)
-
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
-
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..96 bytes, which are fully unrolled, and large
-   copies of more than 96 bytes.  Large copies align the destination
-   and use a load-and-merge approach when the src and dst addresses do
-   not have matching alignment, so that the actual loads and stores
-   are always aligned.  Large copies use loops processing 64 bytes per
-   iteration for the unaligned case and 128 bytes per iteration for
-   the aligned one.
-*/
-
-#define MEMCPY_PREFETCH_LDR 640
-
-	.p2align 4
-ENTRY (MEMCPY)
-
-	DELOUSE (0)
-	DELOUSE (1)
-	DELOUSE (2)
-
-	add	srcend, src, count
-	cmp	count, 16
-	b.ls	L(memcopy16)
-	add	dstend, dstin, count
-	cmp	count, 96
-	b.hi	L(memcopy_long)
-
-	/* Medium copies: 17..96 bytes.  */
-	ldr	A_q, [src], #16
-	and	tmp1, src, 15
-	ldr	E_q, [srcend, -16]
-	cmp	count, 64
-	b.gt	L(memcpy_copy96)
-	cmp	count, 48
-	b.le	L(bytes_17_to_48)
-	/* 49..64 bytes */
-	ldp	B_q, C_q, [src]
-	str	E_q, [dstend, -16]
-	stp	A_q, B_q, [dstin]
-	str	C_q, [dstin, 32]
-	ret
-
-L(bytes_17_to_48):
-	/* 17..48 bytes */
-	cmp	count, 32
-	b.gt	L(bytes_32_to_48)
-	/* 17..32 bytes */
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	ret
-
-L(bytes_32_to_48):
-	/* 32..48 */
-	ldr	B_q, [src]
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	str	B_q, [dstin, 16]
-	ret
-
-	.p2align 4
-	/* Small copies: 0..16 bytes.  */
-L(memcopy16):
-	cmp	count, 8
-	b.lo	L(bytes_0_to_8)
-	ldr	A_l, [src]
-	ldr	A_h, [srcend, -8]
-	add	dstend, dstin, count
-	str	A_l, [dstin]
-	str	A_h, [dstend, -8]
-	ret
-	.p2align 4
-
-L(bytes_0_to_8):
-	tbz	count, 2, L(bytes_0_to_3)
-	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
-	add	dstend, dstin, count
-	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
-	ret
-
-	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-L(bytes_0_to_3):
-	cbz	count, 1f
-	lsr	tmp1, count, 1
-	ldrb	A_lw, [src]
-	ldrb	A_hw, [srcend, -1]
-	add	dstend, dstin, count
-	ldrb	B_lw, [src, tmp1]
-	strb	B_lw, [dstin, tmp1]
-	strb	A_hw, [dstend, -1]
-	strb	A_lw, [dstin]
-1:
-	ret
-
-	.p2align 4
-
-L(memcpy_copy96):
-	/* Copying 65..96 bytes.  A_q (first 16 bytes) and
-	   E_q (last 16 bytes) are already loaded.  The size
-	   is large enough to benefit from aligned loads.  */
-	bic	src, src, 15
-	ldp	B_q, C_q, [src]
-	/* Loaded 64 bytes, the second 16-byte chunk can overlap
-	   the first chunk by tmp1 bytes.  Stored 16 bytes.  */
-	sub	dst, dstin, tmp1
-	add	count, count, tmp1
-	/* The range of count being [65..96] becomes [65..111]
-	   after tmp1 [0..15] gets added to it,
-	   count now is +48 */
-	cmp	count, 80
-	b.gt	L(copy96_medium)
-	ldr	D_q, [src, 32]
-	stp	B_q, C_q, [dst, 16]
-	str	D_q, [dst, 48]
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	ret
-
-	.p2align 4
-L(copy96_medium):
-	ldp	D_q, G_q, [src, 32]
-	cmp	count, 96
-	b.gt	L(copy96_large)
-	stp	B_q, C_q, [dst, 16]
-	stp	D_q, G_q, [dst, 48]
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	ret
-
-L(copy96_large):
-	ldr	F_q, [src, 64]
-	str	B_q, [dst, 16]
-	stp	C_q, D_q, [dst, 32]
-	stp	G_q, F_q, [dst, 64]
-	str	A_q, [dstin]
-	str	E_q, [dstend, -16]
-	ret
-
-	.p2align 4
-L(memcopy_long):
-	cmp	count, 2048
-	b.ls	L(copy2048_large)
-	ldr	A_q, [src], #16
-	and	tmp1, src, 15
-	bic	src, src, 15
-	ldp	B_q, C_q, [src], #32
-	sub	dst, dstin, tmp1
-	add	count, count, tmp1
-	add	dst, dst, 16
-	and	tmp1, dst, 15
-	ldp	D_q, E_q, [src], #32
-	str	A_q, [dstin]
-
-	/* Already loaded 64+16 bytes.  Check if at
-	   least 64 more bytes are left.  */
-	subs	count, count, 64+64+16
-	b.lt	L(loop128_exit0)
-	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
-	b.lt	L(loop128)
-	cbnz	tmp1, L(dst_unaligned)
-	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
-
-	.p2align 4
-
-L(loop128_prefetch):
-	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
-	ldp	F_q, G_q, [src], #32
-	stp	B_q, C_q, [dst], #32
-	ldp	H_q, I_q, [src], #32
-	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
-	ldp	B_q, C_q, [src], #32
-	stp	D_q, E_q, [dst], #32
-	ldp	D_q, E_q, [src], #32
-	stp	F_q, G_q, [dst], #32
-	stp	H_q, I_q, [dst], #32
-	subs	count, count, 128
-	b.ge	L(loop128_prefetch)
-
-	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
-	.p2align 4
-L(loop128):
-	ldp	F_q, G_q, [src], #32
-	ldp	H_q, I_q, [src], #32
-	stp	B_q, C_q, [dst], #32
-	stp	D_q, E_q, [dst], #32
-	subs	count, count, 64
-	b.lt	L(loop128_exit1)
-	ldp	B_q, C_q, [src], #32
-	ldp	D_q, E_q, [src], #32
-	stp	F_q, G_q, [dst], #32
-	stp	H_q, I_q, [dst], #32
-	subs	count, count, 64
-	b.ge	L(loop128)
-L(loop128_exit0):
-	ldp	F_q, G_q, [srcend, -64]
-	ldp	H_q, I_q, [srcend, -32]
-	stp	B_q, C_q, [dst], #32
-	stp	D_q, E_q, [dst]
-	stp	F_q, G_q, [dstend, -64]
-	stp	H_q, I_q, [dstend, -32]
-	ret
-L(loop128_exit1):
-	ldp	B_q, C_q, [srcend, -64]
-	ldp	D_q, E_q, [srcend, -32]
-	stp	F_q, G_q, [dst], #32
-	stp	H_q, I_q, [dst]
-	stp	B_q, C_q, [dstend, -64]
-	stp	D_q, E_q, [dstend, -32]
-	ret
-
-L(copy2048_large):
-	and	tmp1, dstin, 15
-	bic	dst, dstin, 15
-	ldp	D_l, D_h, [src]
-	sub	src, src, tmp1
-	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldp	A_l, A_h, [src, 16]
-	stp	D_l, D_h, [dstin]
-	ldp	B_l, B_h, [src, 32]
-	ldp	C_l, C_h, [src, 48]
-	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 128 + 16	/* Test and readjust count.  */
-	b.ls	L(last64)
-
-L(loop64):
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [src, 16]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [src, 32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [src, 48]
-	stp	D_l, D_h, [dst, 64]
-	ldp	D_l, D_h, [src, 64]
-	add	dst, dst, 64
-	add	src, src, 64
-	subs	count, count, 64
-	b.hi	L(loop64)
-
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the end even if
-	   there is just 1 byte left.  */
-L(last64):
-	ldp	E_l, E_h, [srcend, -64]
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [srcend, -48]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [srcend, -16]
-	stp	D_l, D_h, [dst, 64]
-	stp	E_l, E_h, [dstend, -64]
-	stp	A_l, A_h, [dstend, -48]
-	stp	B_l, B_h, [dstend, -32]
-	stp	C_l, C_h, [dstend, -16]
-	ret
-
-
-L(dst_unaligned_tail):
-	ldp	C_q, D_q, [srcend, -64]
-	ldp	E_q, F_q, [srcend, -32]
-	stp	A_q, B_q, [dst], #32
-	stp	H_q, I_q, [dst], #16
-	str	G_q, [dst, tmp1]
-	stp	C_q, D_q, [dstend, -64]
-	stp	E_q, F_q, [dstend, -32]
-	ret
-
-L(dst_unaligned):
-	/* For the unaligned store case the code loads two
-	   aligned chunks and then merges them using the ext
-	   instruction.  This can be up to 30% faster than
-	   the simple unaligned store access.
-
-	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
-	   contain data yet to be stored.  src and dst point
-	   to the next-to-be-processed data.  A_q, B_q contain
-	   data already stored earlier; count = bytes left to
-	   be loaded, decremented by 64.
-
-	   Control is passed here if at least 64 bytes are left
-	   to be loaded.  The code does two aligned loads and then
-	   extracts (16-tmp1) bytes from the first register and
-	   tmp1 bytes from the next register, forming the value
-	   for the aligned store.
-
-	   As the ext instruction can only have its index encoded
-	   as an immediate, 15 code chunks process each possible
-	   index value.  A computed goto is used to reach the
-	   required code.  */
-
-	/* Store the 16 bytes to dst and align dst for further
-	   operations; several bytes will be stored at this
-	   address once more.  */
-
-	ldp	F_q, G_q, [src], #32
-	stp	B_q, C_q, [dst], #32
-	bic	dst, dst, 15
-	sub	count, count, 32
-	adrp	tmp2, L(ext_table)
-	add	tmp2, tmp2, :lo12:L(ext_table)
-	add	tmp2, tmp2, tmp1, LSL #2
-	ldr	tmp3w, [tmp2]
-	add	tmp2, tmp2, tmp3w, SXTW
-	br	tmp2
-
-.p2align 4
-	/* to make the loop in each chunk 16-byte aligned */
-	nop
-#define EXT_CHUNK(shft) \
-L(ext_size_ ## shft):;\
-	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ext	B_v.16b, D_v.16b, E_v.16b, 16-shft;\
-	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-1:;\
-	stp	A_q, B_q, [dst], #32;\
-	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
-	ldp	C_q, D_q, [src], #32;\
-	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp	H_q, I_q, [dst], #32;\
-	ext	A_v.16b, G_v.16b, C_v.16b, 16-shft;\
-	ext	B_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ldp	F_q, G_q, [src], #32;\
-	ext	H_v.16b, D_v.16b, F_v.16b, 16-shft;\
-	subs	count, count, 64;\
-	b.ge	1b;\
-2:;\
-	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	b	L(dst_unaligned_tail);
-
-EXT_CHUNK(1)
-EXT_CHUNK(2)
-EXT_CHUNK(3)
-EXT_CHUNK(4)
-EXT_CHUNK(5)
-EXT_CHUNK(6)
-EXT_CHUNK(7)
-EXT_CHUNK(8)
-EXT_CHUNK(9)
-EXT_CHUNK(10)
-EXT_CHUNK(11)
-EXT_CHUNK(12)
-EXT_CHUNK(13)
-EXT_CHUNK(14)
-EXT_CHUNK(15)
-
-.p2align 4
-L(move_long):
-1:
-	add	srcend, src, count
-	add	dstend, dstin, count
-
-	and	tmp1, dstend, 15
-	ldr	D_q, [srcend, -16]
-	sub	srcend, srcend, tmp1
-	sub	count, count, tmp1
-	ldp	A_q, B_q, [srcend, -32]
-	str	D_q, [dstend, -16]
-	ldp	C_q, D_q, [srcend, -64]!
-	sub	dstend, dstend, tmp1
-	subs	count, count, 128
-	b.ls	2f
-
-.p2align 4
-1:
-	subs	count, count, 64
-	stp	A_q, B_q, [dstend, -32]
-	ldp	A_q, B_q, [srcend, -32]
-	stp	C_q, D_q, [dstend, -64]!
-	ldp	C_q, D_q, [srcend, -64]!
-	b.hi	1b
-
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
-2:
-	ldp	E_q, F_q, [src, 32]
-	ldp	G_q, H_q, [src]
-	stp	A_q, B_q, [dstend, -32]
-	stp	C_q, D_q, [dstend, -64]
-	stp	E_q, F_q, [dstin, 32]
-	stp	G_q, H_q, [dstin]
-3:	ret
-
-
-.p2align 4
-L(move_middle):
-	cbz	tmp1, 3f
-	add	srcend, src, count
-	prfm	PLDL1STRM, [srcend, -64]
-	add	dstend, dstin, count
-	and	tmp1, dstend, 15
-	ldr	D_q, [srcend, -16]
-	sub	srcend, srcend, tmp1
-	sub	count, count, tmp1
-	ldr	A_q, [srcend, -16]
-	str	D_q, [dstend, -16]
-	ldr	B_q, [srcend, -32]
-	ldr	C_q, [srcend, -48]
-	ldr	D_q, [srcend, -64]!
-	sub	dstend, dstend, tmp1
-	subs	count, count, 128
-	b.ls	2f
-
-1:
-	str	A_q, [dstend, -16]
-	ldr	A_q, [srcend, -16]
-	str	B_q, [dstend, -32]
-	ldr	B_q, [srcend, -32]
-	str	C_q, [dstend, -48]
-	ldr	C_q, [srcend, -48]
-	str	D_q, [dstend, -64]!
-	ldr	D_q, [srcend, -64]!
-	subs	count, count, 64
-	b.hi	1b
-
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
-2:
-	ldr	G_q, [src, 48]
-	str	A_q, [dstend, -16]
-	ldr	A_q, [src, 32]
-	str	B_q, [dstend, -32]
-	ldr	B_q, [src, 16]
-	str	C_q, [dstend, -48]
-	ldr	C_q, [src]
-	str	D_q, [dstend, -64]
-	str	G_q, [dstin, 48]
-	str	A_q, [dstin, 32]
-	str	B_q, [dstin, 16]
-	str	C_q, [dstin]
-3:	ret
-
-
-END (MEMCPY)
-	.section	.rodata
-	.p2align	4
-
-L(ext_table):
-	/* The first entry is for the alignment of 0 and is never
-	   actually used (could be any value).  */
-	.word	0
-	.word	L(ext_size_1) -.
-	.word	L(ext_size_2) -.
-	.word	L(ext_size_3) -.
-	.word	L(ext_size_4) -.
-	.word	L(ext_size_5) -.
-	.word	L(ext_size_6) -.
-	.word	L(ext_size_7) -.
-	.word	L(ext_size_8) -.
-	.word	L(ext_size_9) -.
-	.word	L(ext_size_10) -.
-	.word	L(ext_size_11) -.
-	.word	L(ext_size_12) -.
-	.word	L(ext_size_13) -.
-	.word	L(ext_size_14) -.
-	.word	L(ext_size_15) -.
-
-libc_hidden_builtin_def (MEMCPY)
-#endif
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 0d8c85b4..e69d8162 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -1,5 +1,5 @@
 /* Multiple versions of memmove.  AARCH64 version.
-   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

    The GNU C Library is free software; you can redistribute it and/or
@@ -31,16 +31,13 @@ extern __typeof (__redirect_memmove) __libc_memmove;
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_kunpeng attribute_hidden;
-
+
 libc_ifunc (__libc_memmove,
-	    (IS_KUNPENG920(midr)
-	    ?__memmove_kunpeng
-	    :(IS_THUNDERX (midr)
+	    (IS_THUNDERX (midr)
 	     ? __memmove_thunderx
 	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
 	       ? __memmove_falkor
-	       : __memmove_generic))));
+	       : __memmove_generic)));

 # undef memmove
 strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 0f7ad0c8..f7ae291e 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -1,5 +1,5 @@
 /* Multiple versions of memset.  AARCH64 version.
-   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

    The GNU C Library is free software; you can redistribute it and/or
@@ -29,15 +29,15 @@
 extern __typeof (__redirect_memset) __libc_memset;

 extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
-extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;

 libc_ifunc (__libc_memset,
-	    IS_KUNPENG920(midr)
-	    ?__memset_kunpeng
-	    :((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
-	    ?__memset_falkor
-	    :__memset_generic));
+	    IS_KUNPENG920 (midr)
+	    ? __memset_kunpeng
+	    : ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+	       ? __memset_falkor
+	       : __memset_generic));

 # undef memset
 strong_alias (__libc_memset, memset);
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
index 22a3d4a7..a03441ae 100644
--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2019 Free Software Foundation, Inc.
+/* Optimized memset for Huawei Kunpeng processor.
+   Copyright (C) 2012-2019 Free Software Foundation, Inc.

    This file is part of the GNU C Library.

@@ -14,7 +15,7 @@
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library.  If not, see
-   <http://www.gnu.org/licenses/>.  */
+   <http://www.gnu.org/licenses/>.  */

 #include <sysdep.h>
 #include <memset-reg.h>
@@ -35,7 +36,7 @@ ENTRY_ALIGN (MEMSET, 6)

 	dup	v0.16B, valw
 	add	dstend, dstin, count
-
+
 	cmp	count, 128
 	b.hs	L(set_long)

@@ -44,7 +45,7 @@ ENTRY_ALIGN (MEMSET, 6)
 	/* Set 16..127 bytes.  */
 	str	q0, [dstin]
-	tbnz	count, 6, L(set112)
+	tbnz	count, 6, L(set127)
 	str	q0, [dstend, -16]
 	tbz	count, 5, 1f
 	str	q0, [dstin, 16]
@@ -53,26 +54,14 @@ ENTRY_ALIGN (MEMSET, 6)
 	.p2align 4
 	/* Set 64..127 bytes.  Write 64 bytes from the start and
-	   32 bytes from the end.  */
-L(set112):
-	ands	tmp1, dstin, 15
-	bne	2f
-	str	q0, [dstin, 16]
-	stp	q0, q0, [dstin, 32]	// finish 64
-	tbz	count, 5, 1f
-	stp	q0, q0, [dstin, 64]	// greater than 96, finish 96
-1:	stp	q0, q0, [dstend, -32]
+	   64 bytes from the end.  */
+L(set127):
+	stp	q0, q0, [dstin, 16]
+	str	q0, [dstin, 48]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret
-	.p2align 4
-2:	bic	dst, dstin, 15		// back up to 16-byte alignment
-	stp	q0, q0, [dst, 16]
-	str	q0, [dst, 48]
-	tbz	count, 5, 3f		// greater than 96
-	stp	q0, q0, [dst, 64]
-3:	stp	q0, q0, [dstend, -48]	// finish 64..80
-	str	q0, [dstend, -16]	// finish 96
-	ret
-
+
 	.p2align 4
 	/* Set 0..15 bytes.  */
 L(less16):
@@ -90,10 +79,9 @@ L(less8):
 	tbz	count, 1, 3f
 	str	h0, [dstend, -2]
 3:	ret
-
+
 	.p2align 4
-L(set_long):
-	and	valw, valw, 255
+L(set_long):
 	bic	dst, dstin, 15
 	str	q0, [dstin]
 	sub	count, dstend, dst	/* Count is 16 too large.  */
@@ -103,19 +91,21 @@ L(set_long):
 	stp	q0, q0, [dst, 64]!
 	subs	count, count, 64
 	b.lo	1f
-	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 32]
 	stp	q0, q0, [dst, 64]!
 	subs	count, count, 64
 	b.lo	1f
-	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 32]
 	stp	q0, q0, [dst, 64]!
 	subs	count, count, 64
-	b.hs	1b
-
-1:	tbz	count, 5, 2f
-	str	q0, [dst, 32]
-	str	q0, [dst, 48]
-2:	stp	q0, q0, [dstend, -32]
+	b.lo	1f
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hs	1b
+
+1:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret

 END (MEMSET)
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index 290bcf8d..a64c5980 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -1,5 +1,5 @@
 /* strcpy/stpcpy - copy a string returning pointer to start/end.
-   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+   Copyright (C) 2013-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

    The GNU C Library is free software; you can redistribute it and/or
@@ -14,7 +14,7 @@
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
+   <http://www.gnu.org/licenses/>.  */

 /* To build as stpcpy, define BUILD_STPCPY before compiling this file.

@@ -232,7 +232,7 @@ L(entry_no_page_cross):
 #ifdef __AARCH64EB__
 	rev64	datav.16b, datav.16b
 #endif
-	/* loc */
+	/* calculate the loc value */
 	cmeq	datav.16b, datav.16b, #0
 	mov	data1, datav.d[0]
 	mov	data2, datav.d[1]
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index a57753b0..0a42f404 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -1,6 +1,6 @@
 /* strnlen - calculate the length of a string with limit.

-   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+   Copyright (C) 2013-2018 Free Software Foundation, Inc.

    This file is part of the GNU C Library.

@@ -16,7 +16,7 @@
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library.  If not, see
-   <http://www.gnu.org/licenses/>.  */
+   <http://www.gnu.org/licenses/>.  */

 #include <sysdep.h>

diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index b152c4e3..e60485b0 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -1,6 +1,6 @@
 /* Initialize CPU feature data.  AArch64 version.
    This file is part of the GNU C Library.
-   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.

    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -36,7 +36,7 @@ static struct cpu_list cpu_list[] = {
       {"thunderx2t99",   0x431F0AF0},
       {"thunderx2t99p1", 0x420F5160},
       {"phecda",	 0x680F0000},
-      {"kunpeng920", 	 0x481FD010},
+      {"kunpeng920",	 0x481FD010},
       {"generic", 	 0x0}
 };

diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 4faeed7a..ed77cde7 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -1,6 +1,6 @@
 /* Initialize CPU feature data.  AArch64 version.
    This file is part of the GNU C Library.
-   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.

    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -51,8 +51,9 @@
 #define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h'			      \
                         && MIDR_PARTNUM(midr) == 0x000)
-#define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR(midr) == 'H'		      \
-                        && MIDR_PARTNUM(midr) == 0xd01)
+
+#define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR(midr) == 'H'		      \
+			    && MIDR_PARTNUM(midr) == 0xd01)

 struct cpu_features
 {
-- 
2.19.1
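
Note on the dispatch above: every ifunc in this patch keys off the MIDR_EL1
ID register. IS_KUNPENG920 accepts implementer 'H' (0x48, HiSilicon) and part
number 0xd01, while the glibc.tune.cpu override maps the name "kunpeng920" to
the full MIDR value 0x481FD010 in cpu_list. A minimal standalone C sketch
(not part of the patch; the simplified field extractors here assume the
architectural MIDR layout rather than glibc's internal macros) shows that the
two encodings agree:

    #include <stdint.h>
    #include <stdio.h>

    /* Architectural MIDR_EL1 fields: implementer in bits [31:24], part
       number in bits [15:4]; these use the same shifts as glibc's
       MIDR_IMPLEMENTOR/MIDR_PARTNUM macros.  */
    #define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)
    #define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

    /* Mirrors the IS_KUNPENG920 test added in cpu-features.h.  */
    #define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR (midr) == 'H' \
                                 && MIDR_PARTNUM (midr) == 0xd01)

    int
    main (void)
    {
      uint32_t midr = 0x481FD010;  /* cpu_list value for "kunpeng920" */
      printf ("implementer 0x%02x ('%c'), part 0x%03x, kunpeng920: %d\n",
              (unsigned) MIDR_IMPLEMENTOR (midr),
              (char) MIDR_IMPLEMENTOR (midr),
              (unsigned) MIDR_PARTNUM (midr), IS_KUNPENG920 (midr));
      /* Prints: implementer 0x48 ('H'), part 0xd01, kunpeng920: 1 */
      return 0;
    }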
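
Further back in the patch, the L(bytes_0_to_3) sequence in the deleted
memcpy_kunpeng.S documents a branchless trick: picking the middle byte at
index count >> 1 makes three fixed stores cover every length, writing the
same byte up to three times when count==1 and the 2nd byte twice when
count==2. A C rendering of that sequence (copy_0_to_3 is a hypothetical
helper for illustration, not glibc API):

    #include <stddef.h>

    /* Copy 0..3 bytes with no length-dependent branches beyond the
       initial zero test; the overlapping stores are harmless.  */
    static void
    copy_0_to_3 (unsigned char *dst, const unsigned char *src, size_t count)
    {
      if (count == 0)
        return;                             /* cbz   count, 1f */
      size_t mid = count >> 1;              /* 0 if count==1, else 1 */
      unsigned char first = src[0];         /* ldrb  A_lw, [src] */
      unsigned char last = src[count - 1];  /* ldrb  A_hw, [srcend, -1] */
      unsigned char middle = src[mid];      /* ldrb  B_lw, [src, tmp1] */
      dst[mid] = middle;                    /* strb  B_lw, [dstin, tmp1] */
      dst[count - 1] = last;                /* strb  A_hw, [dstend, -1] */
      dst[0] = first;                       /* strb  A_lw, [dstin] */
    }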