From 8045463341b2495da7b2e7dc308a023764315bbe Mon Sep 17 00:00:00 2001 From: swcompiler Date: Fri, 29 Nov 2024 14:15:45 +0800 Subject: [PATCH 11/23] Sw64: Integer Operation Support --- sysdeps/sw_64/add_n.S | 118 +++++++++ sysdeps/sw_64/addmul_1.S | 89 +++++++ sysdeps/sw_64/bzero.S | 107 ++++++++ sysdeps/sw_64/div.S | 83 ++++++ sysdeps/sw_64/div_libc.h | 170 ++++++++++++ sysdeps/sw_64/divl.S | 96 +++++++ sysdeps/sw_64/divlu.S | 4 + sysdeps/sw_64/divq.S | 290 +++++++++++++++++++++ sysdeps/sw_64/divqu.S | 292 +++++++++++++++++++++ sysdeps/sw_64/htonl.S | 43 +++ sysdeps/sw_64/htons.S | 39 +++ sysdeps/sw_64/ldiv.S | 222 ++++++++++++++++ sysdeps/sw_64/lldiv.S | 1 + sysdeps/sw_64/lshift.S | 107 ++++++++ sysdeps/sw_64/mul_1.S | 82 ++++++ sysdeps/sw_64/reml.S | 93 +++++++ sysdeps/sw_64/remlu.S | 4 + sysdeps/sw_64/remq.S | 274 ++++++++++++++++++++ sysdeps/sw_64/remqu.S | 292 +++++++++++++++++++++ sysdeps/sw_64/rshift.S | 105 ++++++++ sysdeps/sw_64/sub_n.S | 118 +++++++++ sysdeps/sw_64/submul_1.S | 89 +++++++ sysdeps/sw_64/sw6a/add_n.S | 146 +++++++++++ sysdeps/sw_64/sw6a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++ sysdeps/sw_64/sw6a/lshift.S | 172 ++++++++++++ sysdeps/sw_64/sw6a/rshift.S | 170 ++++++++++++ sysdeps/sw_64/sw6a/sub_n.S | 147 +++++++++++ sysdeps/sw_64/sw6b/add_n.S | 146 +++++++++++ sysdeps/sw_64/sw6b/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++ sysdeps/sw_64/sw6b/lshift.S | 172 ++++++++++++ sysdeps/sw_64/sw6b/memcpy.S | 416 +++++++++++++++++++++++++++++ sysdeps/sw_64/sw6b/memset.S | 312 ++++++++++++++++++++++ sysdeps/sw_64/sw6b/rshift.S | 170 ++++++++++++ sysdeps/sw_64/sw6b/stxcpy.S | 314 ++++++++++++++++++++++ sysdeps/sw_64/sw6b/stxncpy.S | 392 ++++++++++++++++++++++++++++ sysdeps/sw_64/sw6b/sub_n.S | 147 +++++++++++ sysdeps/sw_64/sw8a/add_n.S | 146 +++++++++++ sysdeps/sw_64/sw8a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++ sysdeps/sw_64/sw8a/lshift.S | 172 ++++++++++++ sysdeps/sw_64/sw8a/rshift.S | 170 ++++++++++++ sysdeps/sw_64/sw8a/sub_n.S | 147 +++++++++++ sysdeps/sw_64/udiv_qrnnd.S | 159 ++++++++++++ 42 files changed, 7641 insertions(+) create mode 100644 sysdeps/sw_64/add_n.S create mode 100644 sysdeps/sw_64/addmul_1.S create mode 100644 sysdeps/sw_64/bzero.S create mode 100644 sysdeps/sw_64/div.S create mode 100644 sysdeps/sw_64/div_libc.h create mode 100644 sysdeps/sw_64/divl.S create mode 100644 sysdeps/sw_64/divlu.S create mode 100644 sysdeps/sw_64/divq.S create mode 100644 sysdeps/sw_64/divqu.S create mode 100644 sysdeps/sw_64/htonl.S create mode 100644 sysdeps/sw_64/htons.S create mode 100644 sysdeps/sw_64/ldiv.S create mode 100644 sysdeps/sw_64/lldiv.S create mode 100644 sysdeps/sw_64/lshift.S create mode 100644 sysdeps/sw_64/mul_1.S create mode 100644 sysdeps/sw_64/reml.S create mode 100644 sysdeps/sw_64/remlu.S create mode 100644 sysdeps/sw_64/remq.S create mode 100644 sysdeps/sw_64/remqu.S create mode 100644 sysdeps/sw_64/rshift.S create mode 100644 sysdeps/sw_64/sub_n.S create mode 100644 sysdeps/sw_64/submul_1.S create mode 100644 sysdeps/sw_64/sw6a/add_n.S create mode 100644 sysdeps/sw_64/sw6a/addmul_1.S create mode 100644 sysdeps/sw_64/sw6a/lshift.S create mode 100644 sysdeps/sw_64/sw6a/rshift.S create mode 100644 sysdeps/sw_64/sw6a/sub_n.S create mode 100644 sysdeps/sw_64/sw6b/add_n.S create mode 100644 sysdeps/sw_64/sw6b/addmul_1.S create mode 100644 sysdeps/sw_64/sw6b/lshift.S create mode 100644 sysdeps/sw_64/sw6b/memcpy.S create mode 100644 sysdeps/sw_64/sw6b/memset.S create mode 100644 sysdeps/sw_64/sw6b/rshift.S create mode 100644 
sysdeps/sw_64/sw6b/stxcpy.S create mode 100644 sysdeps/sw_64/sw6b/stxncpy.S create mode 100644 sysdeps/sw_64/sw6b/sub_n.S create mode 100644 sysdeps/sw_64/sw8a/add_n.S create mode 100644 sysdeps/sw_64/sw8a/addmul_1.S create mode 100644 sysdeps/sw_64/sw8a/lshift.S create mode 100644 sysdeps/sw_64/sw8a/rshift.S create mode 100644 sysdeps/sw_64/sw8a/sub_n.S create mode 100644 sysdeps/sw_64/udiv_qrnnd.S diff --git a/sysdeps/sw_64/add_n.S b/sysdeps/sw_64/add_n.S new file mode 100644 index 00000000..8c5c8c08 --- /dev/null +++ b/sysdeps/sw_64/add_n.S @@ -0,0 +1,118 @@ + # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + ldl $3,0($17) + ldl $4,0($18) + + subl $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if fmuldiple of 4 limbs, skip first loop + + subl $19,$2,$19 + +.Loop0: subl $2,1,$2 + ldl $5,8($17) + addl $4,$0,$4 + ldl $6,8($18) + cmpult $4,$0,$1 + addl $3,$4,$4 + cmpult $4,$3,$0 + stl $4,0($16) + or $0,$1,$0 + + addl $17,8,$17 + addl $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addl $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subl $19,4,$19 + + ldl $5,8($17) + addl $4,$0,$4 + ldl $6,8($18) + cmpult $4,$0,$1 + addl $3,$4,$4 + cmpult $4,$3,$0 + stl $4,0($16) + or $0,$1,$0 + + ldl $3,16($17) + addl $6,$0,$6 + ldl $4,16($18) + cmpult $6,$0,$1 + addl $5,$6,$6 + cmpult $6,$5,$0 + stl $6,8($16) + or $0,$1,$0 + + ldl $5,24($17) + addl $4,$0,$4 + ldl $6,24($18) + cmpult $4,$0,$1 + addl $3,$4,$4 + cmpult $4,$3,$0 + stl $4,16($16) + or $0,$1,$0 + + ldl $3,32($17) + addl $6,$0,$6 + ldl $4,32($18) + cmpult $6,$0,$1 + addl $5,$6,$6 + cmpult $6,$5,$0 + stl $6,24($16) + or $0,$1,$0 + + addl $17,32,$17 + addl $18,32,$18 + addl $16,32,$16 + bne $19,.Loop + +.Lend: addl $4,$0,$4 + cmpult $4,$0,$1 + addl $3,$4,$4 + cmpult $4,$3,$0 + stl $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_add_n diff --git a/sysdeps/sw_64/addmul_1.S b/sysdeps/sw_64/addmul_1.S new file mode 100644 index 00000000..138e3c69 --- /dev/null +++ b/sysdeps/sw_64/addmul_1.S @@ -0,0 +1,89 @@ + # Sw_64 1621 __mpn_addmul_1 -- Multiply a limb vector with a limb and add + # the result to a second limb vector. + + # Copyright (C) 1992-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. 
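For reference, a minimal C sketch of the carry scheme the __mpn_add_n loop above implements: each limb carry is recovered with unsigned compares, mirroring the addl/cmpult/or sequence. The helper name mpn_add_n_ref and the unsigned-long limb type are illustrative only.

#include <stddef.h>

typedef unsigned long mp_limb_t;   /* 64-bit limb, as on Sw_64 */

/* res = s1 + s2 over n limbs; returns the carry out of the top limb. */
mp_limb_t
mpn_add_n_ref (mp_limb_t *res, const mp_limb_t *s1,
               const mp_limb_t *s2, size_t n)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      mp_limb_t a = s1[i], b = s2[i];
      mp_limb_t t = b + cy;          /* addl $4,$0,$4 */
      mp_limb_t c1 = t < cy;         /* cmpult: carry from adding the old carry */
      mp_limb_t sum = a + t;         /* addl $3,$4,$4 */
      mp_limb_t c2 = sum < a;        /* cmpult: carry from the main add */
      res[i] = sum;
      cy = c1 | c2;                  /* or $0,$1,$0 */
    }
  return cy;
}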
+ + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 2 +__mpn_addmul_1: + .frame $30,0,$26 + + ldl $2,0($17) # $2 = s1_limb + addl $17,8,$17 # s1_ptr++ + subl $18,1,$18 # size-- + mull $2,$19,$3 # $3 = prod_low + ldl $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,.Lend1 # jump if size was == 1 + ldl $2,0($17) # $2 = s1_limb + addl $17,8,$17 # s1_ptr++ + subl $18,1,$18 # size-- + addl $5,$3,$3 + cmpult $3,$5,$4 + stl $3,0($16) + addl $16,8,$16 # res_ptr++ + beq $18,.Lend2 # jump if size was == 2 + + .align 3 +.Loop: mull $2,$19,$3 # $3 = prod_low + ldl $5,0($16) # $5 = *res_ptr + addl $4,$0,$0 # cy_limb = cy_limb + 'cy' + subl $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldl $2,0($17) # $2 = s1_limb + addl $17,8,$17 # s1_ptr++ + addl $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addl $5,$3,$3 + cmpult $3,$5,$5 + stl $3,0($16) + addl $16,8,$16 # res_ptr++ + addl $5,$0,$0 # combine carries + bne $18,.Loop + +.Lend2: mull $2,$19,$3 # $3 = prod_low + ldl $5,0($16) # $5 = *res_ptr + addl $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addl $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addl $5,$3,$3 + cmpult $3,$5,$5 + stl $3,0($16) + addl $5,$0,$0 # combine carries + addl $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +.Lend1: addl $5,$3,$3 + cmpult $3,$5,$5 + stl $3,0($16) + addl $0,$5,$0 + ret $31,($26),1 + + .end __mpn_addmul_1 diff --git a/sysdeps/sw_64/bzero.S b/sysdeps/sw_64/bzero.S new file mode 100644 index 00000000..1a020afd --- /dev/null +++ b/sysdeps/sw_64/bzero.S @@ -0,0 +1,107 @@ +/* Copyright (C) 1996-2023 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +/* Fill a block of memory with zeros. 
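A corresponding C sketch of the __mpn_addmul_1 routine above, with GCC's unsigned __int128 standing in for the mull/umulh pair; the helper name is illustrative.

#include <stddef.h>

typedef unsigned long mp_limb_t;

/* res[0..n-1] += s1[0..n-1] * limb; returns the final carry limb. */
mp_limb_t
mpn_addmul_1_ref (mp_limb_t *res, const mp_limb_t *s1,
                  size_t n, mp_limb_t limb)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) s1[i] * limb;  /* mull + umulh */
      mp_limb_t lo = (mp_limb_t) p;
      mp_limb_t hi = (mp_limb_t) (p >> 64);
      mp_limb_t t = lo + cy;
      hi += t < lo;                  /* carry from adding the previous cy */
      mp_limb_t sum = res[i] + t;
      hi += sum < t;                 /* carry from adding into *res_ptr */
      res[i] = sum;
      cy = hi;
    }
  return cy;
}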
Optimized for the Sw_64 architecture: + + - memory accessed as aligned quadwords only + - destination memory not read unless needed for good cache behaviour + - basic blocks arranged to optimize branch prediction for full-quadword + aligned memory blocks. + - partial head and tail quadwords constructed with byte-mask instructions + +*/ + + +#include + + .set noat + .set noreorder + + .text + .type __bzero, @function + .globl __bzero + .usepv __bzero, USEPV_PROF + + cfi_startproc + + /* On entry to this basic block: + t3 == loop counter + t4 == bytes in partial final word + a0 == possibly misaligned destination pointer */ + + .align 3 +bzero_loop: + beq t3, $tail # + blbc t3, 0f # skip single store if count even + + stl_u zero, 0(a0) # e0 : store one word + subl t3, 1, t3 # .. e1 : + addl a0, 8, a0 # e0 : + beq t3, $tail # .. e1 : + +0: stl_u zero, 0(a0) # e0 : store two words + subl t3, 2, t3 # .. e1 : + stl_u zero, 8(a0) # e0 : + addl a0, 16, a0 # .. e1 : + bne t3, 0b # e1 : + +$tail: bne t4, 1f # is there a tail to do? + ret # no + +1: ldl_u t0, 0(a0) # yes, load original data + mask7b t0, t4, t0 # + stl_u t0, 0(a0) # + ret # + +__bzero: +#ifdef PROF + ldgp gp, 0(pv) + ldi AT, _mcount + call AT, (AT), _mcount +#endif + + mov a0, v0 # e0 : move return value in place + beq a1, $done # .. e1 : early exit for zero-length store + and a0, 7, t1 # e0 : + addl a1, t1, a1 # e1 : add dest misalignment to count + srl a1, 3, t3 # e0 : loop = count >> 3 + and a1, 7, t4 # .. e1 : find number of bytes in tail + unop # : + beq t1, bzero_loop # e1 : aligned head, jump right in + + ldl_u t0, 0(a0) # e0 : load original data to mask into + cmpult a1, 8, t2 # .. e1 : is this a sub-word set + bne t2, $oneq # e1 : + + mask3b t0, a0, t0 # e0 : we span words. finish this partial + subl t3, 1, t3 # e0 : + addl a0, 8, a0 # .. e1 : + stl_u t0, -8(a0) # e0 : + br bzero_loop # .. e1 : + + .align 3 +$oneq: + mask3b t0, a0, t2 # e0 : + mask7b t0, a1, t3 # e0 : + or t2, t3, t0 # e1 : + stl_u t0, 0(a0) # e0 : + +$done: ret + + cfi_endproc +weak_alias (__bzero, bzero) diff --git a/sysdeps/sw_64/div.S b/sysdeps/sw_64/div.S new file mode 100644 index 00000000..6dbdcb7f --- /dev/null +++ b/sysdeps/sw_64/div.S @@ -0,0 +1,83 @@ +/* Copyright (C) 1996-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Richard Henderson . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . 
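The structure of __bzero above can be sketched in C: align to a quadword, store full quadwords, and read-modify-write the partial head and tail words with byte masks. Like ldl_u/stl_u it rewrites the untouched bytes of the partial words, which is only safe under the same assumptions the assembly makes; a little-endian 64-bit target is assumed and the helper name is illustrative.

#include <stdint.h>
#include <string.h>
#include <stddef.h>

static void
bzero_ref (void *dst, size_t len)
{
  if (len == 0)
    return;

  uintptr_t addr = (uintptr_t) dst;
  size_t head = addr & 7;                     /* misalignment of the start */
  unsigned char *w = (unsigned char *) (addr - head);  /* containing quadword */
  size_t total = head + len;                  /* bytes measured from *w */
  const uint64_t zero = 0;
  uint64_t v;

  if (total <= 8)                             /* sub-word case, like $oneq */
    {
      memcpy (&v, w, 8);
      uint64_t keep_low = (1ull << (8 * head)) - 1;           /* bytes before dst */
      uint64_t keep_high = total == 8 ? 0 : ~((1ull << (8 * total)) - 1);
      v &= keep_low | keep_high;              /* analogous to mask3b | mask7b */
      memcpy (w, &v, 8);
      return;
    }

  if (head)                                   /* partial head word */
    {
      memcpy (&v, w, 8);
      v &= (1ull << (8 * head)) - 1;          /* keep only the bytes before dst */
      memcpy (w, &v, 8);
      w += 8;
      total -= 8;
    }

  for (; total >= 8; w += 8, total -= 8)      /* full quadwords, like bzero_loop */
    memcpy (w, &zero, 8);

  if (total)                                  /* partial tail word, like $tail */
    {
      memcpy (&v, w, 8);
      v &= ~((1ull << (8 * total)) - 1);      /* clear the low TOTAL bytes */
      memcpy (w, &v, 8);
    }
}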
*/ + +#include "div_libc.h" + +#undef FRAME +#ifdef __sw_64_fix__ +#define FRAME 0 +#else +#define FRAME 16 +#endif + + .set noat + + .align 4 + .globl div + .ent div +div: + .frame sp, FRAME, ra +#if FRAME > 0 + ldi sp, -FRAME(sp) +#endif +#ifdef PROF + .set macro + ldgp gp, 0(pv) + ldi AT, _mcount + call AT, (AT), _mcount + .set nomacro + .prologue 1 +#else + .prologue 0 +#endif + + beq $18, $divbyzero + rfpcr $f10 + _ITOFT2 $17, $f0, 0, $18, $f1, 8 + fcvtld $f0, $f11 + fcvtld $f1, $f12 + fdivd $f11, $f12, $f1 + fcvtdl_z $f1, $f0 + wfpcr $f10 + _FTOIT $f0, $0, 0 + + mulw $0, $18, $1 + subw $17, $1, $1 + + stw $0, 0(a0) + stw $1, 4(a0) + mov a0, v0 + +#if FRAME > 0 + ldi sp, FRAME(sp) +#endif + ret + +$divbyzero: + mov a0, v0 + ldi a0, GEN_INTDIV + sys_call HMC_gentrap + stw zero, 0(v0) + stw zero, 4(v0) + +#if FRAME > 0 + ldi sp, FRAME(sp) +#endif + ret + + .end div diff --git a/sysdeps/sw_64/div_libc.h b/sysdeps/sw_64/div_libc.h new file mode 100644 index 00000000..2066924b --- /dev/null +++ b/sysdeps/sw_64/div_libc.h @@ -0,0 +1,170 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +/* Common bits for implementing software divide. */ + +#include +#ifdef __linux__ +# include +# include +#else +# include +#endif + +/* These are not normal C functions. Argument registers are t10 and t11; + the result goes in t12; the return address is in t9. Only t12 and AT + may be clobbered. */ +#define X t10 +#define Y t11 +#define RV t12 +#define RA t9 + +/* The secureplt format does not allow the division routines to be called + via plt; there aren't enough registers free to be clobbered. Avoid + setting the symbol type to STT_FUNC, so that the linker won't be tempted + to create a plt entry. */ +#define funcnoplt notype + +/* None of these functions should use implicit anything. */ + .set nomacro + .set noat + +/* Code fragment to invoke _mcount for profiling. This should be invoked + directly after allocation of the stack frame. */ +.macro CALL_MCOUNT +#ifdef PROF + stl ra, 0(sp) + stl pv, 8(sp) + stl gp, 16(sp) + cfi_rel_offset (ra, 0) + cfi_rel_offset (pv, 8) + cfi_rel_offset (gp, 16) + br AT, 1f + .set macro +1: ldgp gp, 0(AT) + mov RA, ra + ldi AT, _mcount + call AT, (AT), _mcount + .set nomacro + ldl ra, 0(sp) + ldl pv, 8(sp) + ldl gp, 16(sp) + cfi_restore (ra) + cfi_restore (pv) + cfi_restore (gp) + /* Realign subsequent code with what we'd have without this + macro at all. This means aligned with one arithmetic insn + used within the bundle. */ + .align 4 + nop +#endif +.endm + +/* In order to make the below work, all top-level divide routines must + use the same frame size. */ +#define FRAME 96 + +/* Code fragment to generate an integer divide-by-zero fault. When + building libc.so, we arrange for there to be one copy of this code + placed late in the dso, such that all branches are forward. 
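div above gets an exact truncated quotient from the double divide (every int is exactly representable in a double, and with |num| < 2**53 the fp rounding error cannot carry the quotient across an integer); the remainder is then num - quot*den, as the mulw/subw pair computes. A C sketch, valid for the inputs div() is defined for; the helper name is illustrative.

#include <stdlib.h>

static div_t
div_ref (int num, int den)        /* den != 0; the real code traps via gentrap */
{
  div_t r;
  r.quot = (int) ((double) num / (double) den);   /* fcvtdl_z: truncate */
  r.rem = num - r.quot * den;                     /* mulw / subw */
  return r;
}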
When + building libc.a, we use multiple copies to avoid having an out of + range branch. Users should jump to DIVBYZERO. */ + +.macro DO_DIVBYZERO +#ifdef PIC +#define DIVBYZERO __divbyzero + .section .gnu.linkonce.t.divbyzero, "ax", @progbits + .globl __divbyzero + .type __divbyzero, @function + .usepv __divbyzero, no + .hidden __divbyzero +#else +#define DIVBYZERO $divbyzero +#endif + + .align 4 +DIVBYZERO: + cfi_startproc + cfi_return_column (RA) + cfi_def_cfa_offset (FRAME) + + mov a0, RV + unop + ldi a0, GEN_INTDIV + sys_call HMC_gentrap + + mov RV, a0 + clr RV + ldi sp, FRAME(sp) + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + + cfi_endproc + .size DIVBYZERO, .-DIVBYZERO +.endm + +/* Like the sw6a instructions, but fall back to stack use on prior machines. */ +#ifdef __sw_64_sw6a__ + .arch sw6a +#endif +#ifdef __sw_64_sw6b__ + .arch sw6b +#endif +#ifdef __sw_64_sw8a__ + .arch sw8a +#endif + +.macro _ITOFS gr, fr, slot +#ifdef __sw_64_fix__ + ifmovs \gr, \fr +#else + stw \gr, \slot(sp) + flds \fr, \slot(sp) +#endif +.endm + +.macro _ITOFT gr, fr, slot +#ifdef __sw_64_fix__ + ifmovd \gr, \fr +#else + stl \gr, \slot(sp) + fldd \fr, \slot(sp) +#endif +.endm + +.macro _FTOIT fr, gr, slot +#ifdef __sw_64_fix__ + fimovd \fr, \gr +#else + fstd \fr, \slot(sp) + ldl \gr, \slot(sp) +#endif +.endm + +/* Similarly, but move two registers. Schedules better for pre-sw6a. */ + +.macro _ITOFT2 gr1, fr1, slot1, gr2, fr2, slot2 +#ifdef __sw_64_fix__ + ifmovd \gr1, \fr1 + ifmovd \gr2, \fr2 +#else + stl \gr1, \slot1(sp) + stl \gr2, \slot2(sp) + fldd \fr1, \slot1(sp) + fldd \fr2, \slot2(sp) +#endif +.endm diff --git a/sysdeps/sw_64/divl.S b/sysdeps/sw_64/divl.S new file mode 100644 index 00000000..1192a0aa --- /dev/null +++ b/sysdeps/sw_64/divl.S @@ -0,0 +1,96 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include "div_libc.h" + +/* 32-bit signed int divide. This is not a normal C function. Argument + registers are t10 and t11, the result goes in t12. Only t12 and AT may + be clobbered. + + The FPU can handle all input values except zero. Whee! + + The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE + for cvttq/c even without /sui being set. It will not, however, properly + raise the exception, so we don't have to worry about FPCR_INED being clear + and so dying by SIGFPE. 
*/ + + /***************************************************************** + # * + # transform to sw-instruct on 2016111216 * + # * + #****************************************************************/ + +#ifndef EXTEND +#define EXTEND(S,D) sextl S, D +#endif + + .text + .align 4 + .globl __divw + .type __divw, @funcnoplt + .usepv __divw, no + + cfi_startproc + cfi_return_column (RA) +__divw: + ldi sp, -FRAME(sp) + cfi_def_cfa_offset (FRAME) + CALL_MCOUNT + fstd $f0, 0(sp) + excb + beq Y, DIVBYZERO + + fstd $f1, 8(sp) + fstd $f2, 16(sp) + fstd $f3, 40(sp) + fstd $f4, 48(sp) + cfi_rel_offset ($f0, 0) + cfi_rel_offset ($f1, 8) + cfi_rel_offset ($f2, 16) + cfi_rel_offset ($f3, 40) + cfi_rel_offset ($f4, 48) + + rfpcr $f2 + EXTEND (X, RV) + EXTEND (Y, AT) + _ITOFT2 RV, $f0, 24, AT, $f1, 32 + fcvtld $f0, $f3 + fcvtld $f1, $f4 + fdivd $f3, $f4, $f1 + fcvtdl_z $f1, $f0 + wfpcr $f2 + _FTOIT $f0, RV, 24 + + fldd $f0, 0(sp) + fldd $f1, 8(sp) + fldd $f2, 16(sp) + fldd $f3, 40(sp) + fldd $f4, 48(sp) + ldi sp, FRAME(sp) + cfi_restore ($f0) + cfi_restore ($f1) + cfi_restore ($f2) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_def_cfa_offset (0) + sextl RV, RV + ret $31, (RA), 1 + + cfi_endproc + .size __divw, .-__divw + + DO_DIVBYZERO diff --git a/sysdeps/sw_64/divlu.S b/sysdeps/sw_64/divlu.S new file mode 100644 index 00000000..26e1842f --- /dev/null +++ b/sysdeps/sw_64/divlu.S @@ -0,0 +1,4 @@ +#define UNSIGNED +#define EXTEND(S,D) zapnot S, 15, D +#define __divw __divwu +#include diff --git a/sysdeps/sw_64/divq.S b/sysdeps/sw_64/divq.S new file mode 100644 index 00000000..61ef58b4 --- /dev/null +++ b/sysdeps/sw_64/divq.S @@ -0,0 +1,290 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include "div_libc.h" + + +/* 64-bit signed long divide. These are not normal C functions. Argument + registers are t10 and t11, the result goes in t12. Only t12 and AT may + be clobbered. + + Theory of operation here is that we can use the FPU divider for virtually + all operands that we see: all dividend values between -2**53 and 2**53-1 + can be computed directly. Note that divisor values need not be checked + against that range because the rounded fp value will be close enough such + that the quotient is < 1, which will properly be truncated to zero when we + convert back to integer. + + When the dividend is outside the range for which we can compute exact + results, we use the fp quotent as an estimate from which we begin refining + an exact integral value. This reduces the number of iterations in the + shift-and-subtract loop significantly. + + The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE + for cvttq/c even without /sui being set. It will not, however, properly + raise the exception, so we don't have to worry about FPCR_INED being clear + and so dying by SIGFPE. 
*/ + /***************************************************************** + # * + # transform to sw-instruct on 2016111216 * + # * + #****************************************************************/ + .text + .align 4 + .globl __divl + .type __divl, @funcnoplt + .usepv __divl, no + + cfi_startproc + cfi_return_column (RA) +__divl: + ldi sp, -FRAME(sp) + cfi_def_cfa_offset (FRAME) + CALL_MCOUNT + + /* Get the fp divide insn issued as quickly as possible. After + that's done, we have at least 22 cycles until its results are + ready -- all the time in the world to figure out how we're + going to use the results. */ + fstd $f0, 0(sp) + excb + beq Y, DIVBYZERO + + fstd $f1, 8(sp) + fstd $f3, 48(sp) + fstd $f4, 56(sp) + fstd $f5, 64(sp) + + cfi_rel_offset ($f0, 0) + cfi_rel_offset ($f1, 8) + cfi_rel_offset ($f3, 48) + cfi_rel_offset ($f4, 56) + cfi_rel_offset ($f5, 64) + rfpcr $f3 + + _ITOFT2 X, $f0, 16, Y, $f1, 24 + fcvtld $f0, $f4 + fcvtld $f1, $f5 + fdivd $f4, $f5, $f0 + + /* Check to see if X fit in the double as an exact value. */ + sll X, (64-53), AT + fldd $f1, 8(sp) + sra AT, (64-53), AT + cmpeq X, AT, AT + beq AT, $x_big + /* If we get here, we're expecting exact results from the division. + Do nothing else besides convert and clean up. */ + fcvtdl_z $f0, $f4 + excb + + wfpcr $f3 + _FTOIT $f4, RV, 16 + fldd $f0, 0(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + cfi_restore ($f1) + cfi_remember_state + cfi_restore ($f0) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + ldi sp, FRAME(sp) + ret $31, (RA), 1 + + .align 4 + cfi_restore_state + +$x_big: + /* If we get here, X is large enough that we don't expect exact + results, and neither X nor Y got mis-translated for the fp + division. Our task is to take the fp result, figure out how + far it's off from the correct result and compute a fixup. */ + stl t0, 32(sp) + stl t1, 40(sp) + stl t2, 16(sp) + stl t5, 24(sp) + cfi_rel_offset (t0, 32) + cfi_rel_offset (t1, 40) + cfi_rel_offset (t2, 16) + cfi_rel_offset (t5, 24) + +#define Q RV /* quotient */ +#define R t0 /* remainder */ +#define SY t1 /* scaled Y */ +#define S t2 /* scalar */ +#define QY t3 /* Q*Y */ + + /* The fixup code below can only handle unsigned values. */ + or X, Y, AT + mov $31, t5 + blt AT, $fix_sign_in +$fix_sign_in_ret1: + fcvtdl_z $f0, $f4 + + _FTOIT $f4, Q, 8 + .align 3 +$fix_sign_in_ret2: + fldd $f0, 0(sp) + stl t3, 0(sp) + cfi_restore ($f0) + cfi_rel_offset (t3, 0) + + mull Q, Y, QY + excb + stl t4, 8(sp) + wfpcr $f3 + cfi_rel_offset (t4, 8) + + subl QY, X, R + mov Y, SY + mov 1, S + bgt R, $q_high + +$q_high_ret: + subl X, QY, R + mov Y, SY + mov 1, S + bgt R, $q_low + +$q_low_ret: + ldl t0, 32(sp) + ldl t1, 40(sp) + ldl t2, 16(sp) + bne t5, $fix_sign_out + +$fix_sign_out_ret: + ldl t3, 0(sp) + ldl t4, 8(sp) + ldl t5, 24(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + ldi sp, FRAME(sp) + cfi_remember_state + cfi_restore (t0) + cfi_restore (t1) + cfi_restore (t2) + cfi_restore (t3) + cfi_restore (t4) + cfi_restore (t5) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + + .align 4 + cfi_restore_state + /* The quotient that we computed was too large. We need to reduce + it by S such that Y*S >= R. Obviously the closer we get to the + correct value the better, but overshooting high is ok, as we'll + fix that up later. 
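The estimate-and-fix-up scheme described above, and implemented by the $q_high/$q_low loops that follow, can be sketched in C. The sketch assumes nonnegative operands below 2**63 (the state after $fix_sign_in in the signed routine); a plain cast stands in for fcvtdl_z and the helper name is illustrative.

#include <stdint.h>

static uint64_t
divl_fixup_ref (uint64_t x, uint64_t y)    /* y != 0, both below 2**63 */
{
  uint64_t q = (uint64_t) ((double) x / (double) y);  /* fp estimate of the quotient */
  uint64_t qy = q * y;                                /* mull: low 64 bits of Q*Y */

  if (qy > x)                     /* $q_high: estimate too large */
    {
      uint64_t r = qy - x, sy = y, s = 1;
      while (sy < r)              /* scale until y*s >= r; overshooting low is fine */
        {
          sy += sy;
          s += s;
        }
      q -= s;
      qy -= sy;
    }

  uint64_t r = x - qy;            /* estimate is now at or below the target */
  if (r != 0)                     /* $q_low: estimate too small */
    {
      uint64_t sy = y, s = 1;
      while (sy < r)              /* scale up until y*s >= r */
        {
          sy += sy;
          s += s;
        }
      while (s)                   /* shift-down-and-subtract */
        {
          if (sy <= r)
            {
              q += s;
              r -= sy;
            }
          sy >>= 1;
          s >>= 1;
        }
    }
  return q;                       /* r now holds the remainder */
}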
*/ +0: + addl SY, SY, SY + addl S, S, S +$q_high: + cmpult SY, R, AT + bne AT, 0b + + subl Q, S, Q + unop + subl QY, SY, QY + br $q_high_ret + + .align 4 + /* The quotient that we computed was too small. Divide Y by the + current remainder (R) and add that to the existing quotient (Q). + The expectation, of course, is that R is much smaller than X. */ + /* Begin with a shift-up loop. Compute S such that Y*S >= R. We + already have a copy of Y in SY and the value 1 in S. */ +0: + addl SY, SY, SY + addl S, S, S +$q_low: + cmpult SY, R, AT + bne AT, 0b + + /* Shift-down and subtract loop. Each iteration compares our scaled + Y (SY) with the remainder (R); if SY <= R then X is divisible by + Y's scalar (S) so add it to the quotient (Q). */ +2: addl Q, S, t3 + srl S, 1, S + cmpule SY, R, AT + subl R, SY, t4 + + selne AT, t3, Q, Q + selne AT, t4, R, R + srl SY, 1, SY + bne S, 2b + + br $q_low_ret + + .align 4 +$fix_sign_in: + /* If we got here, then X|Y is negative. Need to adjust everything + such that we're doing unsigned division in the fixup loop. */ + /* T5 records the changes we had to make: + bit 0: set if result should be negative. + bit 2: set if X was negated. + bit 3: set if Y was negated. + */ + xor X, Y, AT + cmplt AT, 0, t5 + cmplt X, 0, AT + negl X, t0 + + s4addl AT, t5, t5 + selne AT, t0, X, X + cmplt Y, 0, AT + negl Y, t0 + + s8addl AT, t5, t5 + selne AT, t0, Y, Y + unop + blbc t5, $fix_sign_in_ret1 + + fcvtdl_z $f0, $f4 + _FTOIT $f4, Q, 8 + .align 3 + negl Q, Q + br $fix_sign_in_ret2 + + .align 4 +$fix_sign_out: + /* Now we get to undo what we did above. */ + /* ??? Is this really faster than just increasing the size of + the stack frame and storing X and Y in memory? */ + and t5, 8, AT + negl Y, t4 + selne AT, t4, Y, Y + + and t5, 4, AT + negl X, t4 + selne AT, t4, X, X + + negl RV, t4 + sellbs t5, t4, RV, RV + + br $fix_sign_out_ret + + cfi_endproc + .size __divl, .-__divl + + DO_DIVBYZERO diff --git a/sysdeps/sw_64/divqu.S b/sysdeps/sw_64/divqu.S new file mode 100644 index 00000000..7b39201e --- /dev/null +++ b/sysdeps/sw_64/divqu.S @@ -0,0 +1,292 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include "div_libc.h" + + +/* 64-bit unsigned long divide. These are not normal C functions. Argument + registers are t10 and t11, the result goes in t12. Only t12 and AT may be + clobbered. + + Theory of operation here is that we can use the FPU divider for virtually + all operands that we see: all dividend values between -2**53 and 2**53-1 + can be computed directly. Note that divisor values need not be checked + against that range because the rounded fp value will be close enough such + that the quotient is < 1, which will properly be truncated to zero when we + convert back to integer. 
+ + When the dividend is outside the range for which we can compute exact + results, we use the fp quotent as an estimate from which we begin refining + an exact integral value. This reduces the number of iterations in the + shift-and-subtract loop significantly. + + The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE + for cvttq/c even without /sui being set. It will not, however, properly + raise the exception, so we don't have to worry about FPCR_INED being clear + and so dying by SIGFPE. */ + /* transform to sw-instruct on 2016111216 */ + .text + .align 4 + .globl __divlu + .type __divlu, @funcnoplt + .usepv __divlu, no + + cfi_startproc + cfi_return_column (RA) +__divlu: + ldi sp, -FRAME(sp) + cfi_def_cfa_offset (FRAME) + CALL_MCOUNT + + /* Get the fp divide insn issued as quickly as possible. After + that's done, we have at least 22 cycles until its results are + ready -- all the time in the world to figure out how we're + going to use the results. */ + beq Y, DIVBYZERO + fstd $f0, 0(sp) + fstd $f1, 8(sp) + fstd $f3, 48(sp) + fstd $f4, 56(sp) + fstd $f5, 64(sp) + stl t0,32(sp) + stl t1,40(sp) + cfi_rel_offset ($f0, 0) + cfi_rel_offset ($f1, 8) + cfi_rel_offset ($f3, 48) + cfi_rel_offset ($f4, 56) + cfi_rel_offset ($f5, 64) + cfi_rel_offset (t0, 32) + cfi_rel_offset (t1, 40) + + rfpcr $f3 + /*add it for there has some err when with -mieee of + 0xffffffffffffffff/2*/ + rfpcr $f1 + fimovd $f1,t0 + ldi t1,3 + sll t1,58,t1 + bic t0,t1,t0 + ifmovd t0,$f1 + wfpcr $f1 + _ITOFT2 X, $f0, 16, Y, $f1, 24 + fcvtld $f0, $f4 + fcvtld $f1, $f5 + blt X, $x_is_neg + fdivd $f4, $f5, $f0 + + /* Check to see if Y was mis-converted as signed value. */ + fldd $f1, 8(sp) + blt Y, $y_is_neg + + /* Check to see if X fit in the double as an exact value. */ + srl X, 53, AT + bne AT, $x_big + + /* If we get here, we're expecting exact results from the division. + Do nothing else besides convert and clean up. */ + fcvtdl $f0, $f4 + wfpcr $f3 + _FTOIT $f4, RV, 16 + + ldl t0,32(sp) + ldl t1,40(sp) + fldd $f0, 0(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + ldi sp, FRAME(sp) + cfi_remember_state + cfi_restore (t0) + cfi_restore (t1) + cfi_restore ($f0) + cfi_restore ($f1) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + + .align 4 + cfi_restore_state +$x_is_neg: + /* If we get here, X is so big that bit 63 is set, which made the + conversion come out negative. Fix it up lest we not even get + a good estimate. */ + ldih AT, 0x5f80 /* 2**64 as float. */ + fstd $f2, 24(sp) + fstd $f6, 72(sp) + cfi_rel_offset ($f2, 24) + cfi_rel_offset ($f5, 72) + _ITOFS AT, $f2, 16 + + .align 4 + faddd $f4, $f2, $f6 + unop + fdivd $f6, $f5, $f0 + unop + + /* Ok, we've now the divide issued. Continue with other checks. */ + fldd $f1, 8(sp) + unop + fldd $f2, 24(sp) + fldd $f6, 72(sp) + blt Y, $y_is_neg + cfi_restore ($f1) + cfi_restore ($f2) + cfi_restore ($f6) + cfi_remember_state /* for y_is_neg */ + + .align 4 +$x_big: + /* If we get here, X is large enough that we don't expect exact + results, and neither X nor Y got mis-translated for the fp + division. Our task is to take the fp result, figure out how + far it's off from the correct result and compute a fixup. 
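Two side conditions of __divlu can also be written out in C: a dividend with bit 63 set converts negative through the signed int-to-fp move and is repaired by adding 2**64 (the 0x5f80 constant above is 2**64 as a float), and a divisor with bit 63 set can only yield a quotient of 0 or 1. Helper names are illustrative; the general case defers to the fix-up sketch shown earlier.

#include <stdint.h>

static double
u64_to_double_ref (uint64_t v)       /* what the $x_is_neg path corrects for */
{
  double d = (double) (int64_t) v;   /* signed conversion, as ifmovd + fcvtld */
  if ((int64_t) v < 0)
    d += 0x1p64;                     /* add 2**64 to undo the wrap */
  return d;
}

static uint64_t
divlu_edge_ref (uint64_t x, uint64_t y)   /* y != 0 */
{
  if (y >> 63)                       /* huge divisor: quotient is 0 or 1 */
    return x >= y;                   /* cmpule Y, X, RV */
  /* Otherwise: estimate u64_to_double_ref (x) / (double) y and, once
     x >= 2**53, refine it as in the fix-up sketch earlier.  */
  return x / y;                      /* placeholder for that path */
}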
*/ + stl t2, 16(sp) + stl t3, 24(sp) + cfi_rel_offset (t0, 32) + cfi_rel_offset (t1, 40) + cfi_rel_offset (t2, 16) + cfi_rel_offset (t3, 24) + +#define Q RV /* quotient */ +#define R t0 /* remainder */ +#define SY t1 /* scaled Y */ +#define S t2 /* scalar */ +#define QY t3 /* Q*Y */ + + fcvtdl $f0, $f4 + _FTOIT $f4, Q, 8 + mull Q, Y, QY + + .align 4 + stl t4, 8(sp) + excb + fldd $f0, 0(sp) + wfpcr $f3 + cfi_rel_offset (t4, 8) + cfi_restore ($f0) + + subl QY, X, R + mov Y, SY + mov 1, S + bgt R, $q_high + +$q_high_ret: + subl X, QY, R + mov Y, SY + mov 1, S + bgt R, $q_low + +$q_low_ret: + ldl t4, 8(sp) + ldl t0, 32(sp) + ldl t1, 40(sp) + ldl t2, 16(sp) + + ldl t3, 24(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + ldi sp, FRAME(sp) + cfi_remember_state + cfi_restore (t0) + cfi_restore (t1) + cfi_restore (t2) + cfi_restore (t3) + cfi_restore (t4) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + + .align 4 + cfi_restore_state + /* The quotient that we computed was too large. We need to reduce + it by S such that Y*S >= R. Obviously the closer we get to the + correct value the better, but overshooting high is ok, as we'll + fix that up later. */ +0: + addl SY, SY, SY + addl S, S, S +$q_high: + cmpult SY, R, AT + bne AT, 0b + + subl Q, S, Q + unop + subl QY, SY, QY + br $q_high_ret + + .align 4 + /* The quotient that we computed was too small. Divide Y by the + current remainder (R) and add that to the existing quotient (Q). + The expectation, of course, is that R is much smaller than X. */ + /* Begin with a shift-up loop. Compute S such that Y*S >= R. We + already have a copy of Y in SY and the value 1 in S. */ +0: + addl SY, SY, SY + addl S, S, S +$q_low: + cmpult SY, R, AT + bne AT, 0b + + /* Shift-down and subtract loop. Each iteration compares our scaled + Y (SY) with the remainder (R); if SY <= R then X is divisible by + Y's scalar (S) so add it to the quotient (Q). */ +2: addl Q, S, t3 + srl S, 1, S + cmpule SY, R, AT + subl R, SY, t4 + + selne AT, t3, Q, Q + selne AT, t4, R, R + srl SY, 1, SY + bne S, 2b + + br $q_low_ret + + .align 4 + cfi_restore_state +$y_is_neg: + /* If we get here, Y is so big that bit 63 is set. The results + from the divide will be completely wrong. Fortunately, the + quotient must be either 0 or 1, so just compute it directly. */ + cmpule Y, X, RV + excb + wfpcr $f3 + fldd $f0, 0(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + ldl t0,32(sp) + ldl t1,40(sp) + ldi sp, FRAME(sp) + cfi_restore (t0) + cfi_restore (t1) + cfi_restore ($f0) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + cfi_endproc + .size __divlu, .-__divlu + + DO_DIVBYZERO diff --git a/sysdeps/sw_64/htonl.S b/sysdeps/sw_64/htonl.S new file mode 100644 index 00000000..7fc0aa24 --- /dev/null +++ b/sysdeps/sw_64/htonl.S @@ -0,0 +1,43 @@ +/* Copyright (C) 1996-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +ENTRY(htonl) +#ifdef PROF + ldgp gp, 0(pv) + .set noat + ldi AT, _mcount + call AT, (AT), _mcount + .set at + .prologue 1 +#else + .prologue 0 +#endif + + ins6b a0, 7, t0 # t0 = 0000000000AABBCC + ins1b a0, 3, t1 # t1 = 000000CCDD000000 + or t1, t0, t1 # t1 = 000000CCDDAABBCC + srl t1, 16, t2 # t2 = 0000000000CCDDAA + zapnot t1, 0x0A, t0 # t0 = 00000000DD00BB00 + zapnot t2, 0x05, t3 # t3 = 0000000000CC00AA + addw t0, t3, v0 # v0 = ssssssssDDCCBBAA + ret + + END(htonl) + +weak_alias (htonl, ntohl) diff --git a/sysdeps/sw_64/htons.S b/sysdeps/sw_64/htons.S new file mode 100644 index 00000000..8a981be1 --- /dev/null +++ b/sysdeps/sw_64/htons.S @@ -0,0 +1,39 @@ +/* Copyright (C) 1996-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +ENTRY(htons) +#ifdef PROF + ldgp gp, 0(pv) + .set noat + ldi AT, _mcount + call AT, (AT), _mcount + .set at + .prologue 1 +#else + .prologue 0 +#endif + + ext5b a0, 7, t1 # t1 = bb00 + ext0b a0, 1, v0 # v0 = 00aa + bis v0, t1, v0 # v0 = bbaa + ret + + END(htons) + +weak_alias (htons, ntohs) diff --git a/sysdeps/sw_64/ldiv.S b/sysdeps/sw_64/ldiv.S new file mode 100644 index 00000000..7a77d6dd --- /dev/null +++ b/sysdeps/sw_64/ldiv.S @@ -0,0 +1,222 @@ + +/* Copyright (C) 1996-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Richard Henderson . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + + +#include "div_libc.h" + +#undef FRAME +#ifdef __sw_64_fix__ +#define FRAME 0 +#else +#define FRAME 16 +#endif + +#undef X +#undef Y +#define X $17 +#define Y $18 + + .set noat + + .align 4 + .globl ldiv + .ent ldiv +ldiv: + .frame sp, FRAME, ra +#if FRAME > 0 + ldi sp, -FRAME(sp) +#endif +#ifdef PROF + .set macro + ldgp gp, 0(pv) + ldi AT, _mcount + call AT, (AT), _mcount + .set nomacro + .prologue 1 +#else + .prologue 0 +#endif + + beq Y, $divbyzero + mov Y,t6 + nop + rfpcr $f10 + + _ITOFT2 X, $f0, 0, Y, $f1, 8 + + .align 4 + fcvtld $f0, $f11 + fcvtld $f1, $f12 + fdivd $f11, $f12, $f0 + unop + + /* Check to see if X fit in the double as an exact value. 
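The ins/ext/zapnot sequences in htonl and htons above are plain byte reversals (Sw_64 is little-endian, so host-to-network order must swap). Their effect in C, with illustrative helper names:

#include <stdint.h>

static uint32_t
htonl_ref (uint32_t x)
{
  return (x >> 24) | ((x >> 8) & 0x0000ff00u)
         | ((x << 8) & 0x00ff0000u) | (x << 24);
}

static uint16_t
htons_ref (uint16_t x)
{
  return (uint16_t) ((x >> 8) | (x << 8));
}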
*/ + sll X, (64-53), AT + sra AT, (64-53), AT + cmpeq X, AT, AT + beq AT, $x_big + + /* If we get here, we're expecting exact results from the division. + Do nothing else besides convert and clean up. */ + fcvtdl_z $f0, $f11 + nop + wfpcr $f10 + _FTOIT $f11, $0, 0 + +$egress: +// mull $0, Y, $1 + mull $0, t6, $1 + subl X, $1, $1 + + stl $0, 0($16) + stl $1, 8($16) + mov $16, $0 + +#if FRAME > 0 + ldi sp, FRAME(sp) +#endif + ret + + .align 4 +$x_big: + /* If we get here, X is large enough that we don't expect exact + results, and neither X nor Y got mis-translated for the fp + division. Our task is to take the fp result, figure out how + far it's off from the correct result and compute a fixup. */ + +#define Q v0 /* quotient */ +#define R t0 /* remainder */ +#define SY t1 /* scaled Y */ +#define S t2 /* scalar */ +#define QY t3 /* Q*Y */ + + /* The fixup code below can only handle unsigned values. */ + bis X, Y, AT + mov $31, t5 + blt AT, $fix_sign_in +$fix_sign_in_ret1: + fcvtdl_z $f0, $f11 + + _FTOIT $f11, Q, 8 +$fix_sign_in_ret2: + mull Q, Y, QY + nop + wfpcr $f10 + + .align 4 + subl QY, X, R + mov Y, SY + mov 1, S + bgt R, $q_high + +$q_high_ret: + subl X, QY, R + mov Y, SY + mov 1, S + bgt R, $q_low + +$q_low_ret: + negl Q, t4 + sellbs t5, t4, Q, Q + br $egress + + .align 4 + /* The quotient that we computed was too large. We need to reduce + it by S such that Y*S >= R. Obviously the closer we get to the + correct value the better, but overshooting high is ok, as we'll + fix that up later. */ +0: + addl SY, SY, SY + addl S, S, S +$q_high: + cmpult SY, R, AT + bne AT, 0b + + subl Q, S, Q + unop + subl QY, SY, QY + br $q_high_ret + + .align 4 + /* The quotient that we computed was too small. Divide Y by the + current remainder (R) and add that to the existing quotient (Q). + The expectation, of course, is that R is much smaller than X. */ + /* Begin with a shift-up loop. Compute S such that Y*S >= R. We + already have a copy of Y in SY and the value 1 in S. */ +0: + addl SY, SY, SY + addl S, S, S +$q_low: + cmpult SY, R, AT + bne AT, 0b + + /* Shift-down and subtract loop. Each iteration compares our scaled + Y (SY) with the remainder (R); if SY <= R then X is divisible by + Y's scalar (S) so add it to the quotient (Q). */ +2: addl Q, S, t3 + srl S, 1, S + cmpule SY, R, AT + subl R, SY, t4 + + selne AT, t3, Q, Q + selne AT, t4, R, R + srl SY, 1, SY + bne S, 2b + + br $q_low_ret + + .align 4 +$fix_sign_in: + /* If we got here, then X|Y is negative. Need to adjust everything + such that we're doing unsigned division in the fixup loop. */ + /* T5 is true if result should be negative. */ + xor X, Y, AT + cmplt AT, 0, t5 + cmplt X, 0, AT + negl X, t0 + + selne AT, t0, X, X + cmplt Y, 0, AT + negl Y, t0 + + selne AT, t0, Y, Y + blbc t5, $fix_sign_in_ret1 + + fcvtdl_z $f0, $f11 + _FTOIT $f11, Q, 8 + .align 3 + negl Q, Q + br $fix_sign_in_ret2 + +$divbyzero: + mov a0, v0 + ldi a0, GEN_INTDIV + sys_call HMC_gentrap + stl zero, 0(v0) + stl zero, 8(v0) + +#if FRAME > 0 + ldi sp, FRAME(sp) +#endif + ret + + .end ldiv + +weak_alias (ldiv, lldiv) +weak_alias (ldiv, imaxdiv) diff --git a/sysdeps/sw_64/lldiv.S b/sysdeps/sw_64/lldiv.S new file mode 100644 index 00000000..8a8ef97a --- /dev/null +++ b/sysdeps/sw_64/lldiv.S @@ -0,0 +1 @@ +/* lldiv is the same as ldiv on the Sw_64. 
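ldiv above (also the body behind the lldiv and imaxdiv aliases) produces the quotient from the fp-assisted divide and then derives the remainder with the mull/subl pair at $egress. The equivalent C for inputs where the quotient is representable; the helper name is illustrative.

#include <stdlib.h>

static ldiv_t
ldiv_ref (long num, long den)     /* den != 0; the real code traps via gentrap */
{
  ldiv_t r;
  r.quot = num / den;             /* stands in for the fp estimate + fix-up */
  r.rem = num - r.quot * den;     /* mull / subl */
  return r;
}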
*/ diff --git a/sysdeps/sw_64/lshift.S b/sysdeps/sw_64/lshift.S new file mode 100644 index 00000000..700e9d80 --- /dev/null +++ b/sysdeps/sw_64/lshift.S @@ -0,0 +1,107 @@ + # Sw_64 1621 __mpn_lshift -- + + # Copyright (C) 1994-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling, + # it would take 4 cycles/limb. It should be possible to get down to 3 + # cycles/limb since both ldl and stl can be paired with the other used + # instructions. But there are many restrictions in the 1621 pipeline that + # makes it hard, if not impossible, to get down to 3 cycles/limb: + + # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay. + # 2. Only aligned instruction pairs can be paired. + # 3. The store buffer or silo might not be able to deal with the bandwidth. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addl $18,$17,$17 # make r17 point at end of s1 + ldl $4,-8($17) # load first limb + subl $17,8,$17 + subl $31,$19,$7 + s8addl $18,$16,$16 # make r16 point at end of RES + subl $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + srl $4,$7,$0 # compute function result + + beq $20,.L0 + subl $18,$20,$18 + + .align 3 +.Loop0: + ldl $3,-8($17) + subl $16,8,$16 + subl $17,8,$17 + subl $20,1,$20 + sll $4,$19,$5 + srl $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stl $8,0($16) + bne $20,.Loop0 + +.L0: beq $18,.Lend + + .align 3 +.Loop: ldl $3,-8($17) + subl $16,32,$16 + subl $18,4,$18 + sll $4,$19,$5 + srl $3,$7,$6 + + ldl $4,-16($17) + sll $3,$19,$1 + bis $5,$6,$8 + stl $8,24($16) + srl $4,$7,$2 + + ldl $3,-24($17) + sll $4,$19,$5 + bis $1,$2,$8 + stl $8,16($16) + srl $3,$7,$6 + + ldl $4,-32($17) + sll $3,$19,$1 + bis $5,$6,$8 + stl $8,8($16) + srl $4,$7,$2 + + subl $17,32,$17 + bis $1,$2,$8 + stl $8,0($16) + + bgt $18,.Loop + +.Lend: sll $4,$19,$8 + stl $8,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/sysdeps/sw_64/mul_1.S b/sysdeps/sw_64/mul_1.S new file mode 100644 index 00000000..127f4274 --- /dev/null +++ b/sysdeps/sw_64/mul_1.S @@ -0,0 +1,82 @@ + # Sw_64 1621 __mpn_mul_1 -- Multiply a limb vector with a limb and store + # the result in a second limb vector. + + # Copyright (C) 1992-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. 
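A C sketch of the __mpn_lshift contract above: shift an n-limb number left by cnt bits (0 < cnt < 64), walking from the most significant limb so the destination may overlap the source shifted upward, and return the bits pushed out of the top. The helper name and limb type are illustrative.

#include <stddef.h>

typedef unsigned long mp_limb_t;

mp_limb_t
mpn_lshift_ref (mp_limb_t *res, const mp_limb_t *src, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;                 /* subl $31,$19,$7, taken mod 64 */
  mp_limb_t high = src[n - 1];
  mp_limb_t retval = high >> tnc;          /* bits shifted out of the top limb */

  for (size_t i = n - 1; i > 0; i--)
    {
      mp_limb_t low = src[i - 1];
      res[i] = (high << cnt) | (low >> tnc);   /* sll / srl / bis */
      high = low;
    }
  res[0] = high << cnt;
  return retval;
}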
+ + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + + # To improve performance for long fmuldiplications, we would use + # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + # these instructions without slowing down the general code: 1. We can + # only have two prefetches in operation at any time in the Sw_64 + # architecture. 2. There will seldom be any special alignment + # between RES_PTR and S1_PTR. Maybe we can simply divide the current + # loop into an inner and outer loop, having the inner loop handle + # exactly one prefetch block? + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_mul_1 + .ent __mpn_mul_1 2 +__mpn_mul_1: + .frame $30,0,$26 + + ldl $2,0($17) # $2 = s1_limb + subl $18,1,$18 # size-- + mull $2,$19,$3 # $3 = prod_low + bic $31,$31,$4 # clear cy_limb + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldl $2,8($17) # $2 = s1_limb + subl $18,1,$18 # size-- + stl $3,0($16) + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mull $2,$19,$3 # $3 = prod_low + addl $4,$0,$0 # cy_limb = cy_limb + 'cy' + subl $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldl $2,16($17) # $2 = s1_limb + addl $17,8,$17 # s1_ptr++ + addl $3,$0,$3 # $3 = cy_limb + prod_low + stl $3,8($16) + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addl $16,8,$16 # res_ptr++ + bne $18,Loop + +Lend2: mull $2,$19,$3 # $3 = prod_low + addl $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addl $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + stl $3,8($16) + addl $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: stl $3,0($16) + ret $31,($26),1 + + .end __mpn_mul_1 diff --git a/sysdeps/sw_64/reml.S b/sysdeps/sw_64/reml.S new file mode 100644 index 00000000..56a550d9 --- /dev/null +++ b/sysdeps/sw_64/reml.S @@ -0,0 +1,93 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + Contributed by Richard Henderson + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include "div_libc.h" + +/* 32-bit signed int remainder. This is not a normal C function. Argument + registers are t10 and t11, the result goes in t12. Only t12 and AT may + be clobbered. + + The FPU can handle the division for all input values except zero. + All we have to do is compute the remainder via multiply-and-subtract. + + The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE + for cvttq/c even without /sui being set. 
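A C sketch of __mpn_mul_1 above, with GCC's unsigned __int128 standing in for the mull/umulh pair; the helper name is illustrative.

#include <stddef.h>

typedef unsigned long mp_limb_t;

/* res[i] = s1[i] * limb, folding each high half into the next step's carry;
   returns the final carry limb. */
mp_limb_t
mpn_mul_1_ref (mp_limb_t *res, const mp_limb_t *s1, size_t n, mp_limb_t limb)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) s1[i] * limb;   /* mull + umulh */
      mp_limb_t lo = (mp_limb_t) p + cy;
      cy = (mp_limb_t) (p >> 64) + (lo < (mp_limb_t) p);        /* carry out */
      res[i] = lo;
    }
  return cy;
}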
It will not, however, properly + raise the exception, so we don't have to worry about FPCR_INED being clear + and so dying by SIGFPE. */ + /*__reml->__remw 20161111*/ +#ifndef EXTEND +#define EXTEND(S,D) sextl S, D +#endif + + .text + .align 4 + .globl __remw + .type __remw, @funcnoplt + .usepv __remw, no + + cfi_startproc + cfi_return_column (RA) +__remw: + ldi sp, -FRAME(sp) + cfi_def_cfa_offset (FRAME) + CALL_MCOUNT + fstd $f0, 0(sp) + excb + beq Y, DIVBYZERO + + fstd $f1, 8(sp) + fstd $f2, 16(sp) + fstd $f3, 40(sp) + fstd $f4, 48(sp) + cfi_rel_offset ($f0, 0) + cfi_rel_offset ($f1, 8) + cfi_rel_offset ($f2, 16) + cfi_rel_offset ($f3, 40) + cfi_rel_offset ($f4, 48) + + rfpcr $f2 + EXTEND (X, RV) + EXTEND (Y, AT) + _ITOFT2 RV, $f0, 24, AT, $f1, 32 + fcvtld $f0, $f3 + fcvtld $f1, $f4 + fdivd $f3, $f4, $f0 + fcvtdl_z $f0, $f3 + + wfpcr $f2 + _FTOIT $f3, RV, 24 + fldd $f0, 0(sp) + mulw RV, Y, RV + fldd $f1, 8(sp) + fldd $f2, 16(sp) + fldd $f3, 40(sp) + fldd $f4, 48(sp) + ldi sp, FRAME(sp) + cfi_restore ($f0) + cfi_restore ($f1) + cfi_restore ($f2) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_def_cfa_offset (0) + subw X, RV, RV + ret $31, (RA), 1 + + cfi_endproc + .size __remw, .-__remw + + DO_DIVBYZERO diff --git a/sysdeps/sw_64/remlu.S b/sysdeps/sw_64/remlu.S new file mode 100644 index 00000000..3c12f7bf --- /dev/null +++ b/sysdeps/sw_64/remlu.S @@ -0,0 +1,4 @@ +#define UNSIGNED +#define EXTEND(S,D) zapnot S, 15, D +#define __remw __remwu +#include diff --git a/sysdeps/sw_64/remq.S b/sysdeps/sw_64/remq.S new file mode 100644 index 00000000..6db7f628 --- /dev/null +++ b/sysdeps/sw_64/remq.S @@ -0,0 +1,274 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include "div_libc.h" + + +/* 64-bit signed long remainder. These are not normal C functions. Argument + registers are t10 and t11, the result goes in t12. Only t12 and AT may + be clobbered. + + Theory of operation here is that we can use the FPU divider for virtually + all operands that we see: all dividend values between -2**53 and 2**53-1 + can be computed directly. Note that divisor values need not be checked + against that range because the rounded fp value will be close enough such + that the quotient is < 1, which will properly be truncated to zero when we + convert back to integer. + + When the dividend is outside the range for which we can compute exact + results, we use the fp quotent as an estimate from which we begin refining + an exact integral value. This reduces the number of iterations in the + shift-and-subtract loop significantly. + + The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE + for cvttq/c even without /sui being set. It will not, however, properly + raise the exception, so we don't have to worry about FPCR_INED being clear + and so dying by SIGFPE. 
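__remw above takes the truncated 32-bit quotient from the double divide and recovers the remainder with the mulw/subw pair. The same identity in C; the helper name is illustrative and the wider intermediate sidesteps the INT_MIN / -1 corner.

#include <stdint.h>

static int32_t
remw_ref (int32_t x, int32_t y)   /* y != 0; the real code traps on 0 */
{
  int64_t q = (int64_t) ((double) x / (double) y);  /* truncated, exact for int32 */
  return (int32_t) (x - q * y);                     /* mulw / subw */
}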
*/ + .text + .align 4 + .globl __reml + .type __reml, @funcnoplt + .usepv __reml, no + + cfi_startproc + cfi_return_column (RA) +__reml: + ldi sp, -FRAME(sp) + cfi_def_cfa_offset (FRAME) + CALL_MCOUNT + + /* Get the fp divide insn issued as quickly as possible. After + that's done, we have at least 22 cycles until its results are + ready -- all the time in the world to figure out how we're + going to use the results. */ + fstd $f0, 0(sp) + excb + beq Y, DIVBYZERO + + fstd $f1, 8(sp) + fstd $f3, 48(sp) + fstd $f4, 56(sp) + fstd $f5, 64(sp) + cfi_rel_offset ($f0, 0) + cfi_rel_offset ($f1, 8) + cfi_rel_offset ($f3, 48) + cfi_rel_offset ($f4, 56) + cfi_rel_offset ($f5, 64) + + rfpcr $f3 + _ITOFT2 X, $f0, 16, Y, $f1, 24 + fcvtld $f0, $f4 + fcvtld $f1, $f5 + fdivd $f4, $f5, $f0 + + /* Check to see if X fit in the double as an exact value. */ + sll X, (64-53), AT + fldd $f1, 8(sp) + sra AT, (64-53), AT + cmpeq X, AT, AT + beq AT, $x_big + fcvtdl_z $f0, $f4 + + wfpcr $f3 + _FTOIT $f4, AT, 16 + mull AT, Y, AT + fldd $f0, 0(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + cfi_restore ($f1) + cfi_remember_state + cfi_restore ($f0) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + ldi sp, FRAME(sp) + subl X, AT, RV + ret $31, (RA), 1 + + .align 4 + cfi_restore_state +$x_big: + /* If we get here, X is large enough that we don't expect exact + results, and neither X nor Y got mis-translated for the fp + division. Our task is to take the fp result, figure out how + far it's off from the correct result and compute a fixup. */ + stl t0, 32(sp) + stl t1, 40(sp) + stl t2, 16(sp) + stl t5, 24(sp) + cfi_rel_offset (t0, 32) + cfi_rel_offset (t1, 40) + cfi_rel_offset (t2, 16) + cfi_rel_offset (t5, 24) + +#define Q t0 /* quotient */ +#define R RV /* remainder */ +#define SY t1 /* scaled Y */ +#define S t2 /* scalar */ +#define QY t3 /* Q*Y */ + + /* The fixup code below can only handle unsigned values. */ + or X, Y, AT + mov $31, t5 + blt AT, $fix_sign_in +$fix_sign_in_ret1: + fcvtdl_z $f0, $f4 + _FTOIT $f4, Q, 8 + .align 3 +$fix_sign_in_ret2: + fldd $f0, 0(sp) + stl t3, 0(sp) + cfi_restore ($f0) + cfi_rel_offset (t3, 0) + + mull Q, Y, QY + stl t4, 8(sp) + wfpcr $f3 + cfi_rel_offset (t4, 8) + + subl QY, X, R + mov Y, SY + mov 1, S + bgt R, $q_high + +$q_high_ret: + subl X, QY, R + mov Y, SY + mov 1, S + bgt R, $q_low + +$q_low_ret: + ldl t0, 32(sp) + ldl t1, 40(sp) + ldl t2, 16(sp) + bne t5, $fix_sign_out + +$fix_sign_out_ret: + ldl t3, 0(sp) + ldl t4, 8(sp) + ldl t5, 24(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + ldi sp, FRAME(sp) + cfi_remember_state + cfi_restore (t0) + cfi_restore (t1) + cfi_restore (t2) + cfi_restore (t3) + cfi_restore (t4) + cfi_restore (t5) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + + .align 4 + cfi_restore_state + /* The quotient that we computed was too large. We need to reduce + it by S such that Y*S >= R. Obviously the closer we get to the + correct value the better, but overshooting high is ok, as we'll + fix that up later. */ +0: + addl SY, SY, SY + addl S, S, S +$q_high: + cmpult SY, R, AT + bne AT, 0b + + subl Q, S, Q + unop + subl QY, SY, QY + br $q_high_ret + + .align 4 + /* The quotient that we computed was too small. Divide Y by the + current remainder (R) and add that to the existing quotient (Q). + The expectation, of course, is that R is much smaller than X. */ + /* Begin with a shift-up loop. Compute S such that Y*S >= R. 
We + already have a copy of Y in SY and the value 1 in S. */ +0: + addl SY, SY, SY + addl S, S, S +$q_low: + cmpult SY, R, AT + bne AT, 0b + + /* Shift-down and subtract loop. Each iteration compares our scaled + Y (SY) with the remainder (R); if SY <= R then X is divisible by + Y's scalar (S) so add it to the quotient (Q). */ +2: addl Q, S, t3 + srl S, 1, S + cmpule SY, R, AT + subl R, SY, t4 + + selne AT, t3, Q, Q + selne AT, t4, R, R + srl SY, 1, SY + bne S, 2b + + br $q_low_ret + + .align 4 +$fix_sign_in: + /* If we got here, then X|Y is negative. Need to adjust everything + such that we're doing unsigned division in the fixup loop. */ + /* T5 records the changes we had to make: + bit 0: set if X was negated. Note that the sign of the + remainder follows the sign of the divisor. + bit 2: set if Y was negated. + */ + xor X, Y, t1 + cmplt X, 0, t5 + negl X, t0 + selne t5, t0, X, X + + cmplt Y, 0, AT + negl Y, t0 + s4addl AT, t5, t5 + selne AT, t0, Y, Y + + bge t1, $fix_sign_in_ret1 + fcvtdl_z $f0, $f4 + _FTOIT $f4, Q, 8 + .align 3 + negl Q, Q + br $fix_sign_in_ret2 + + .align 4 +$fix_sign_out: + /* Now we get to undo what we did above. */ + /* ??? Is this really faster than just increasing the size of + the stack frame and storing X and Y in memory? */ + and t5, 4, AT + negl Y, t4 + selne AT, t4, Y, Y + + negl X, t4 + sellbs t5, t4, X, X + negl RV, t4 + sellbs t5, t4, RV, RV + + br $fix_sign_out_ret + + cfi_endproc + .size __reml, .-__reml + + DO_DIVBYZERO diff --git a/sysdeps/sw_64/remqu.S b/sysdeps/sw_64/remqu.S new file mode 100644 index 00000000..946e031b --- /dev/null +++ b/sysdeps/sw_64/remqu.S @@ -0,0 +1,292 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include "div_libc.h" + + +/* 64-bit unsigned long remainder. These are not normal C functions. Argument + registers are t10 and t11, the result goes in t12. Only t12 and AT may be + clobbered. + + Theory of operation here is that we can use the FPU divider for virtually + all operands that we see: all dividend values between -2**53 and 2**53-1 + can be computed directly. Note that divisor values need not be checked + against that range because the rounded fp value will be close enough such + that the quotient is < 1, which will properly be truncated to zero when we + convert back to integer. + + When the dividend is outside the range for which we can compute exact + results, we use the fp quotent as an estimate from which we begin refining + an exact integral value. This reduces the number of iterations in the + shift-and-subtract loop significantly. + + The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE + for cvttq/c even without /sui being set. It will not, however, properly + raise the exception, so we don't have to worry about FPCR_INED being clear + and so dying by SIGFPE. 
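When the dividend does not fit in 53 bits, the fixup loops described above refine the FP estimate into an exact answer. A C model of that refinement is sketched below; the name remq_fixup_sketch and the precondition that q_est is within the error of the double-precision divide (so that none of the intermediate values wrap) are assumptions of this sketch, and y must be nonzero.

  #include <stdint.h>

  /* Mirror of the $q_high / $q_low shift-and-subtract fixup.  */
  static uint64_t remq_fixup_sketch (uint64_t x, uint64_t y, uint64_t q_est)
  {
    uint64_t q = q_est, qy = q * y, r, s, sy;

    if (qy > x)                        /* estimate too large: back it off */
      {
        for (r = qy - x, s = 1, sy = y; sy < r; s += s, sy += sy)
          continue;                    /* scale y until it covers the excess */
        q -= s;
        qy -= sy;
      }

    r = x - qy;                        /* estimate now too small, or exact */
    if (r >= y)
      {
        for (s = 1, sy = y; sy < r; s += s, sy += sy)
          continue;                    /* shift-up: find S with S*Y >= R */
        for (; s != 0; s >>= 1, sy >>= 1)
          if (sy <= r)                 /* shift-down, restoring division */
            {
              r -= sy;
              q += s;
            }
      }
    (void) q;                          /* the divide entry points want q; rem wants r */
    return r;
  }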
*/ + .text + .align 4 + .globl __remlu + .type __remlu, @funcnoplt + .usepv __remlu, no + + cfi_startproc + cfi_return_column (RA) +__remlu: + ldi sp, -FRAME(sp) + cfi_def_cfa_offset (FRAME) + CALL_MCOUNT + + /* Get the fp divide insn issued as quickly as possible. After + that's done, we have at least 22 cycles until its results are + ready -- all the time in the world to figure out how we're + going to use the results. */ + subl Y, 1, AT + and Y, AT, AT + beq AT, $powerof2 + fstd $f0, 0(sp) + + + fstd $f1, 8(sp) + fstd $f3, 48(sp) + fstd $f4, 56(sp) + fstd $f5, 64(sp) + cfi_rel_offset ($f0, 0) + cfi_rel_offset ($f1, 8) + cfi_rel_offset ($f3, 48) + cfi_rel_offset ($f4, 56) + cfi_rel_offset ($f5, 64) + + rfpcr $f3 + _ITOFT2 X, $f0, 16, Y, $f1, 24 + + fcvtld $f0, $f4 + fcvtld $f1, $f5 + + blt X, $x_is_neg +setfpec1 + fdivd $f4, $f5, $f0 + + /* Check to see if Y was mis-converted as signed value. */ + fldd $f1, 8(sp) + blt Y, $y_is_neg + + /* Check to see if X fit in the double as an exact value. */ + srl X, 53, AT + bne AT, $x_big + + /* If we get here, we're expecting exact results from the division. + Do nothing else besides convert, compute remainder, clean up. */ + fcvtdl_z $f0, $f4 + wfpcr $f3 + _FTOIT $f4, AT, 16 + mull AT, Y, AT + fldd $f0, 0(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + ldi sp, FRAME(sp) + cfi_remember_state + cfi_restore ($f0) + cfi_restore ($f1) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + + .align 4 + subl X, AT, RV + ret $31, (RA), 1 + .align 4 + cfi_restore_state +$x_is_neg: + /* If we get here, X is so big that bit 63 is set, which made the + conversion come out negative. Fix it up lest we not even get + a good estimate. */ + ldih AT, 0x5f80 /* 2**64 as float. */ + fstd $f2, 24(sp) + fstd $f6, 72(sp) + cfi_rel_offset ($f2, 24) + cfi_rel_offset ($f6, 72) + _ITOFS AT, $f2, 16 + .align 4 + faddd $f4, $f2, $f6 + fdivd $f6, $f5, $f0 + + /* Ok, we've now the divide issued. Continue with other checks. */ +# .align 4 + fldd $f1, 8(sp) + unop + fldd $f2, 24(sp) + fldd $f6, 72(sp) + blt Y, $y_is_neg + cfi_restore ($f1) + cfi_restore ($f2) + cfi_restore ($f6) + cfi_remember_state /* for y_is_neg */ + + .align 4 +$x_big: + /* If we get here, X is large enough that we don't expect exact + results, and neither X nor Y got mis-translated for the fp + division. Our task is to take the fp result, figure out how + far it's off from the correct result and compute a fixup. 
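Before the FP path is used at all, this unsigned entry point peels off divisors that need no real division. A condensed C view is given below; remlu_sketch is an illustrative name, y is assumed nonzero (the real code branches to DIVBYZERO), and the final x % y fallback merely stands in for the FP-estimate path that the surrounding assembly implements.

  #include <stdint.h>

  static uint64_t remlu_sketch (uint64_t x, uint64_t y)
  {
    if ((y & (y - 1)) == 0)        /* power of two: keep only the low bits */
      return x & (y - 1);
    if (y > (UINT64_MAX >> 1))     /* bit 63 set: the quotient is 0 or 1 */
      return x >= y ? x - y : x;
    return x % y;                  /* stand-in for the FP-divide path */
  }

When x itself has bit 63 set, the assembly instead adds 2**64 (as a double, the 0x5f80 constant loaded below) to the converted value so the estimate remains usable despite the signed conversion.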
*/ + stl t0, 32(sp) + stl t1, 40(sp) + stl t2, 16(sp) + stl t3, 24(sp) + cfi_rel_offset (t0, 32) + cfi_rel_offset (t1, 40) + cfi_rel_offset (t2, 16) + cfi_rel_offset (t3, 24) + +#define Q t0 /* quotient */ +#define R RV /* remainder */ +#define SY t1 /* scaled Y */ +#define S t2 /* scalar */ +#define QY t3 /* Q*Y */ + + fcvtdl_z $f0, $f4 + _FTOIT $f4, Q, 8 + mull Q, Y, QY + + .align 4 + stl t4, 8(sp) + excb + fldd $f0, 0(sp) + wfpcr $f3 + cfi_rel_offset (t4, 8) + cfi_restore ($f0) + + subl QY, X, R + mov Y, SY + mov 1, S + bgt R, $q_high + +$q_high_ret: + subl X, QY, R + mov Y, SY + mov 1, S + bgt R, $q_low + +$q_low_ret: + ldl t4, 8(sp) + ldl t0, 32(sp) + ldl t1, 40(sp) + ldl t2, 16(sp) + + ldl t3, 24(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + ldi sp, FRAME(sp) + cfi_remember_state + cfi_restore (t0) + cfi_restore (t1) + cfi_restore (t2) + cfi_restore (t3) + cfi_restore (t4) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + + .align 4 + cfi_restore_state + /* The quotient that we computed was too large. We need to reduce + it by S such that Y*S >= R. Obviously the closer we get to the + correct value the better, but overshooting high is ok, as we'll + fix that up later. */ +0: + addl SY, SY, SY + addl S, S, S +$q_high: + cmpult SY, R, AT + bne AT, 0b + + subl Q, S, Q + unop + subl QY, SY, QY + br $q_high_ret + + .align 4 + /* The quotient that we computed was too small. Divide Y by the + current remainder (R) and add that to the existing quotient (Q). + The expectation, of course, is that R is much smaller than X. */ + /* Begin with a shift-up loop. Compute S such that Y*S >= R. We + already have a copy of Y in SY and the value 1 in S. */ +0: + addl SY, SY, SY + addl S, S, S +$q_low: + cmpult SY, R, AT + bne AT, 0b + + /* Shift-down and subtract loop. Each iteration compares our scaled + Y (SY) with the remainder (R); if SY <= R then X is divisible by + Y's scalar (S) so add it to the quotient (Q). */ +2: addl Q, S, t3 + srl S, 1, S + cmpule SY, R, AT + subl R, SY, t4 + + selne AT, t3, Q, Q + selne AT, t4, R, R + srl SY, 1, SY + bne S, 2b + + br $q_low_ret + + .align 4 + cfi_restore_state +$y_is_neg: + /* If we get here, Y is so big that bit 63 is set. The results + from the divide will be completely wrong. Fortunately, the + quotient must be either 0 or 1, so the remainder must be X + or X-Y, so just compute it directly. */ + cmpule Y, X, AT + nop + wfpcr $f3 + subl X, Y, RV + fldd $f0, 0(sp) + fldd $f3, 48(sp) + fldd $f4, 56(sp) + fldd $f5, 64(sp) + seleq AT, X, RV, RV + + ldi sp, FRAME(sp) + cfi_restore ($f0) + cfi_restore ($f3) + cfi_restore ($f4) + cfi_restore ($f5) + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + .align 4 + cfi_def_cfa_offset (FRAME) +$powerof2: + subl Y, 1, AT + beq Y, DIVBYZERO + and X, AT, RV + ldi sp, FRAME(sp) + cfi_def_cfa_offset (0) + ret $31, (RA), 1 + + cfi_endproc + .size __remlu, .-__remlu + + DO_DIVBYZERO diff --git a/sysdeps/sw_64/rshift.S b/sysdeps/sw_64/rshift.S new file mode 100644 index 00000000..81b3d742 --- /dev/null +++ b/sysdeps/sw_64/rshift.S @@ -0,0 +1,105 @@ + # Sw_64 1621 __mpn_rshift -- + + # Copyright (C) 1994-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. 
+ + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling, + # it would take 4 cycles/limb. It should be possible to get down to 3 + # cycles/limb since both ldl and stl can be paired with the other used + # instructions. But there are many restrictions in the 1621 pipeline that + # makes it hard, if not impossible, to get down to 3 cycles/limb: + + # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay. + # 2. Only aligned instruction pairs can be paired. + # 3. The store buffer or silo might not be able to deal with the bandwidth. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldl $4,0($17) # load first limb + addl $17,8,$17 + subl $31,$19,$7 + subl $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + sll $4,$7,$0 # compute function result + + beq $20,.L0 + subl $18,$20,$18 + + .align 3 +.Loop0: + ldl $3,0($17) + addl $16,8,$16 + addl $17,8,$17 + subl $20,1,$20 + srl $4,$19,$5 + sll $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stl $8,-8($16) + bne $20,.Loop0 + +.L0: beq $18,.Lend + + .align 3 +.Loop: ldl $3,0($17) + addl $16,32,$16 + subl $18,4,$18 + srl $4,$19,$5 + sll $3,$7,$6 + + ldl $4,8($17) + srl $3,$19,$1 + bis $5,$6,$8 + stl $8,-32($16) + sll $4,$7,$2 + + ldl $3,16($17) + srl $4,$19,$5 + bis $1,$2,$8 + stl $8,-24($16) + sll $3,$7,$6 + + ldl $4,24($17) + srl $3,$19,$1 + bis $5,$6,$8 + stl $8,-16($16) + sll $4,$7,$2 + + addl $17,32,$17 + bis $1,$2,$8 + stl $8,-8($16) + + bgt $18,.Loop + +.Lend: srl $4,$19,$8 + stl $8,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/sysdeps/sw_64/sub_n.S b/sysdeps/sw_64/sub_n.S new file mode 100644 index 00000000..d0d5a30c --- /dev/null +++ b/sysdeps/sw_64/sub_n.S @@ -0,0 +1,118 @@ + # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . 
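The operation implemented by __mpn_rshift above (and again, with deeper unrolling, by the sw6a/sw6b/sw8a variants later in this patch) fits in a few lines of C. The sketch below assumes 64-bit limbs, n >= 1 and 0 < cnt < 64; the name mpn_rshift_ref is illustrative and is not something the patch installs.

  #include <stdint.h>

  /* Shift {up, n} right by cnt bits into {rp, n} and return the bits that
     fall off the low end, left-justified, as the assembly returns in $0.  */
  static uint64_t mpn_rshift_ref (uint64_t *rp, const uint64_t *up,
                                  long n, unsigned cnt)
  {
    uint64_t retval = up[0] << (64 - cnt);
    for (long i = 0; i < n - 1; i++)
      rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
    rp[n - 1] = up[n - 1] >> cnt;
    return retval;
  }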
+ + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + ldl $3,0($17) + ldl $4,0($18) + + subl $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if fmuldiple of 4 limbs, skip first loop + + subl $19,$2,$19 + +.Loop0: subl $2,1,$2 + ldl $5,8($17) + addl $4,$0,$4 + ldl $6,8($18) + cmpult $4,$0,$1 + subl $3,$4,$4 + cmpult $3,$4,$0 + stl $4,0($16) + or $0,$1,$0 + + addl $17,8,$17 + addl $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addl $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subl $19,4,$19 + + ldl $5,8($17) + addl $4,$0,$4 + ldl $6,8($18) + cmpult $4,$0,$1 + subl $3,$4,$4 + cmpult $3,$4,$0 + stl $4,0($16) + or $0,$1,$0 + + ldl $3,16($17) + addl $6,$0,$6 + ldl $4,16($18) + cmpult $6,$0,$1 + subl $5,$6,$6 + cmpult $5,$6,$0 + stl $6,8($16) + or $0,$1,$0 + + ldl $5,24($17) + addl $4,$0,$4 + ldl $6,24($18) + cmpult $4,$0,$1 + subl $3,$4,$4 + cmpult $3,$4,$0 + stl $4,16($16) + or $0,$1,$0 + + ldl $3,32($17) + addl $6,$0,$6 + ldl $4,32($18) + cmpult $6,$0,$1 + subl $5,$6,$6 + cmpult $5,$6,$0 + stl $6,24($16) + or $0,$1,$0 + + addl $17,32,$17 + addl $18,32,$18 + addl $16,32,$16 + bne $19,.Loop + +.Lend: addl $4,$0,$4 + cmpult $4,$0,$1 + subl $3,$4,$4 + cmpult $3,$4,$0 + stl $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_sub_n diff --git a/sysdeps/sw_64/submul_1.S b/sysdeps/sw_64/submul_1.S new file mode 100644 index 00000000..2cad2bef --- /dev/null +++ b/sysdeps/sw_64/submul_1.S @@ -0,0 +1,89 @@ + # Sw_64 1621 __mpn_submul_1 -- Multiply a limb vector with a limb and + # fsubdract the result from a second limb vector. + + # Copyright (C) 1992-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . 
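The borrow bookkeeping in __mpn_sub_n above, the paired cmpult results that get OR'd together for each limb, corresponds to the C below; the name mpn_sub_n_ref and the 64-bit limb type are illustrative. The unrolled sw6a/sw6b variants later in the patch track exactly the same two carry bits per limb, just software-pipelined across groups of four limbs.

  #include <stdint.h>

  /* {rp, n} = {s1, n} - {s2, n}; return the final borrow (0 or 1).  */
  static uint64_t mpn_sub_n_ref (uint64_t *rp, const uint64_t *s1,
                                 const uint64_t *s2, long n)
  {
    uint64_t cy = 0;
    for (long i = 0; i < n; i++)
      {
        uint64_t t = s2[i] + cy;      /* fold the incoming borrow into s2 */
        uint64_t c1 = t < cy;         /* carry out of that add */
        uint64_t d = s1[i] - t;       /* main subtract */
        uint64_t c2 = s1[i] < d;      /* borrow out of the subtract */
        rp[i] = d;
        cy = c1 | c2;                 /* combine the two carry bits */
      }
    return cy;
  }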
+ + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_submul_1 + .ent __mpn_submul_1 2 +__mpn_submul_1: + .frame $30,0,$26 + + ldl $2,0($17) # $2 = s1_limb + addl $17,8,$17 # s1_ptr++ + subl $18,1,$18 # size-- + mull $2,$19,$3 # $3 = prod_low + ldl $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,.Lend1 # jump if size was == 1 + ldl $2,0($17) # $2 = s1_limb + addl $17,8,$17 # s1_ptr++ + subl $18,1,$18 # size-- + subl $5,$3,$3 + cmpult $5,$3,$4 + stl $3,0($16) + addl $16,8,$16 # res_ptr++ + beq $18,.Lend2 # jump if size was == 2 + + .align 3 +.Loop: mull $2,$19,$3 # $3 = prod_low + ldl $5,0($16) # $5 = *res_ptr + addl $4,$0,$0 # cy_limb = cy_limb + 'cy' + subl $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldl $2,0($17) # $2 = s1_limb + addl $17,8,$17 # s1_ptr++ + addl $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subl $5,$3,$3 + cmpult $5,$3,$5 + stl $3,0($16) + addl $16,8,$16 # res_ptr++ + addl $5,$0,$0 # combine carries + bne $18,.Loop + +.Lend2: mull $2,$19,$3 # $3 = prod_low + ldl $5,0($16) # $5 = *res_ptr + addl $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addl $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subl $5,$3,$3 + cmpult $5,$3,$5 + stl $3,0($16) + addl $5,$0,$0 # combine carries + addl $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +.Lend1: subl $5,$3,$3 + cmpult $5,$3,$5 + stl $3,0($16) + addl $0,$5,$0 + ret $31,($26),1 + + .end __mpn_submul_1 diff --git a/sysdeps/sw_64/sw6a/add_n.S b/sysdeps/sw_64/sw6a/add_n.S new file mode 100644 index 00000000..86e9f9ae --- /dev/null +++ b/sysdeps/sw_64/sw6a/add_n.S @@ -0,0 +1,146 @@ + # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . 
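__mpn_submul_1 above multiplies a limb vector by a single limb and subtracts the product from the destination, returning the final borrow limb. In C, with the mull/umulh pair expressed through unsigned __int128 (a GCC/Clang extension) and an illustrative function name, the per-limb step looks like the sketch below; the sw6a/sw6b addmul_1 files that follow do the same with the subtraction replaced by an addition, unrolled eight limbs deep.

  #include <stdint.h>

  /* {rp, n} -= {up, n} * v; return the high limb of the product stream
     plus the accumulated borrows.  */
  static uint64_t mpn_submul_1_ref (uint64_t *rp, const uint64_t *up,
                                    long n, uint64_t v)
  {
    uint64_t cy = 0;
    for (long i = 0; i < n; i++)
      {
        unsigned __int128 p = (unsigned __int128) up[i] * v;   /* mull/umulh */
        uint64_t plo = (uint64_t) p + cy;
        uint64_t phi = (uint64_t) (p >> 64) + (plo < cy);
        uint64_t r = rp[i] - plo;
        phi += rp[i] < plo;           /* borrow from the subtract */
        rp[i] = r;
        cy = phi;
      }
    return cy;
  }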
+ + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subl $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldl $0,0($18) + ldl $1,8($18) + ldl $4,0($17) + ldl $5,8($17) + addl $17,32,$17 # update s1_ptr + ldl $2,16($18) + addl $0,$4,$20 # 1st main add + ldl $3,24($18) + subl $19,4,$19 # decr loop cnt + ldl $6,-16($17) + cmpult $20,$0,$25 # compute cy from last add + ldl $7,-8($17) + addl $1,$25,$28 # cy add + addl $18,32,$18 # update s2_ptr + addl $5,$28,$21 # 2nd main add + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $21,$28,$25 # compute cy from last add + ldl $0,0($18) + or $8,$25,$25 # combine cy from the two fadds + ldl $1,8($18) + addl $2,$25,$28 # cy add + ldl $4,0($17) + addl $28,$6,$22 # 3rd main add + ldl $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + addl $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + addl $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + addl $0,$25,$28 # cy add + ldl $2,16($18) + addl $4,$28,$20 # 1st main add + ldl $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldl $6,-16($17) + cmpult $20,$28,$25 # compute cy from last add + ldl $7,-8($17) + or $8,$25,$25 # combine cy from the two fadds + subl $19,4,$19 # decr loop cnt + stl $22,-16($16) + addl $1,$25,$28 # cy add + stl $23,-8($16) + addl $5,$28,$21 # 2nd main add + addl $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $21,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $2,$25,$28 # cy add + addl $28,$6,$22 # 3rd main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + addl $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + stl $22,-16($16) + stl $23,-8($16) +.Lend2: addl $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldl $0,0($18) + ldl $4,0($17) + subl $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addl $0,$25,$28 # cy add + ldl $0,8($18) + addl $4,$28,$20 # main add + ldl $4,8($17) + addl $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addl $17,8,$17 + stl $20,0($16) + cmpult $20,$28,$25 # compute cy from last add + subl $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two fadds + addl $16,8,$16 + bne $19,.Loop0 +.Lend0: addl $0,$25,$28 # cy add + addl $4,$28,$20 # main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $20,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + +.Lret: or 
$25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_add_n diff --git a/sysdeps/sw_64/sw6a/addmul_1.S b/sysdeps/sw_64/sw6a/addmul_1.S new file mode 100644 index 00000000..287e8573 --- /dev/null +++ b/sysdeps/sw_64/sw6a/addmul_1.S @@ -0,0 +1,475 @@ + # Sw_64 mpn_addmul_1 -- Multiply a limb vector with a limb and add + # the result to a second limb vector. + # + # Copyright (C) 2000-2023 Free Software Foundation, Inc. + # + # This file is part of the GNU MP Library. + # + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published + # by the Free Software Foundation; either version 2.1 of the License, or (at + # your option) any later version. + # + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + # + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # size $18 + # s2_limb $19 + # + # + # This code was written in close cooperation with pipeline expert + # . Any errors are tege's fault, though. + # + # Register usages for unrolled loop: + # 0-3 mul's + # 4-7 acc's + # 8-15 mul results + # 20,21 carry's + # 22,23 save for stores + # + # Sustains 8 mul-fadds in 29 cycles in the unrolled inner loop. + # + # The stores can issue a cycle late so we have paired no-op's to 'catch' + # them, so that further disturbance to the schedule is damped. + # + # We couldn't pair the loads, because the entangled schedule of the + # carry's has to happen on one side {0} of the machine. Note, the total + # use of U0, and the total use of L0 (after attending to the stores). + # which is part of the reason why.... + # + # This is a great schedule for the d_cache, a poor schedule for the + # b_cache. The lockup on U0 means that any stall can't be recovered + # from. Consider a ldl in L1. say that load gets stalled because it + # collides with a fill from the b_Cache. On the next cycle, this load + # gets priority. If first looks at L0, and goes there. The instruction + # we intended for L0 gets to look at L1, which is NOT where we want + # it. It either stalls 1, because it can't go in L0, or goes there, and + # causes a further instruction to stall. + # + # So for b_cache, we're likely going to want to put one or more cycles + # back into the code! And, of course, put in prefetches. For the + # accumulator, flds, intent to modify. For the fmuldiplier, you might + # want ldl, evict next, if you're not wanting to use it again soon. Use + # 256 ahead of present pointer value. At a place where we have an mt + # followed by a bookkeeping, put the bookkeeping in upper, and the + # prefetch into lower. + # + # Note, the usage of physical registers per cycle is smoothed off, as + # much as possible. + # + # Note, the ldl's and stl's are at the end of the quadpacks. note, we'd + # like not to have a ldl or stl to preceded a conditional branch in a + # quadpack. The conditional branch moves the retire pointer one cycle + # later. + # + # Optimization notes: + # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27? 
+ # Reserved regs: $29 $30 $31 + # Free caller-saves regs in unrolled code: $24 $25 $28 + # We should swap some of the callee-saves regs for some of the free + # caller-saves regs, saving some overhead cycles. + # Most importantly, we should write fast code for the 0-7 case. + # The code we use there are for the 21164, and runs at 7 cycles/limb + # on the 21264. Should not be hard, if we write specialized code for + # 1-7 limbs (the one for 0 limbs should be straightforward). We then just + # need a jump table indexed by the low 3 bits of the count argument. + + .set noreorder + .set noat + .text + + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 +__mpn_addmul_1: + .frame $30,0,$26,0 + .prologue 0 + + cmpult $18, 8, $1 + beq $1, $Large + + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $18, 1, $18 # size-- + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + umulh $2, $19, $0 # $0 = prod_high + beq $18, $Lend0b # jump if size was == 1 + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $18, 1, $18 # size-- + addl $5, $3, $3 + cmpult $3, $5, $4 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + beq $18, $Lend0a # jump if size was == 2 + + .align 3 +$Loop0: mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + subl $18, 1, $18 # size-- + umulh $2, $19, $4 # $4 = cy_limb + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + bne $18, $Loop0 +$Lend0a: + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + umulh $2, $19, $4 # $4 = cy_limb + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $5, $0, $0 # combine carries + addl $4, $0, $0 # cy_limb = prod_high + cy + ret $31, ($26), 1 +$Lend0b: + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $0, $5, $0 + ret $31, ($26), 1 + +$Large: + ldi $30, -240($30) + stl $9, 8($30) + stl $10, 16($30) + stl $11, 24($30) + stl $12, 32($30) + stl $13, 40($30) + stl $14, 48($30) + stl $15, 56($30) + + and $18, 7, $20 # count for the first loop, 0-7 + srl $18, 3, $18 # count for unrolled loop + bis $31, $31, $0 + beq $20, $Lunroll + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $20, 1, $20 # size-- + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + umulh $2, $19, $0 # $0 = prod_high + beq $20, $Lend1b # jump if size was == 1 + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $20, 1, $20 # size-- + addl $5, $3, $3 + cmpult $3, $5, $4 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + beq $20, $Lend1a # jump if size was == 2 + + .align 3 +$Loop1: mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + subl $20, 1, $20 # size-- + umulh $2, $19, $4 # $4 = cy_limb + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + bne $20, $Loop1 + +$Lend1a: + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = 
*res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + umulh $2, $19, $4 # $4 = cy_limb + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + addl $4, $0, $0 # cy_limb = prod_high + cy + br $31, $Lunroll +$Lend1b: + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $0, $5, $0 + +$Lunroll: + ldi $17, -16($17) # L1 bookkeeping + ldi $16, -16($16) # L1 bookkeeping + bis $0, $31, $12 + + # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ + + ldl $2, 16($17) # L1 + ldl $3, 24($17) # L1 + ldi $18, -1($18) # L1 bookkeeping + ldl $6, 16($16) # L1 + ldl $7, 24($16) # L1 + ldl $0, 32($17) # L1 + mull $19, $2, $13 # U1 + ldl $1, 40($17) # L1 + umulh $19, $2, $14 # U1 + mull $19, $3, $15 # U1 + ldi $17, 64($17) # L1 bookkeeping + ldl $4, 32($16) # L1 + ldl $5, 40($16) # L1 + umulh $19, $3, $8 # U1 + ldl $2, -16($17) # L1 + mull $19, $0, $9 # U1 + ldl $3, -8($17) # L1 + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + mull $19, $1, $11 # U1 + cmpult $6, $13, $20 # L0 lo add => carry + ldi $16, 64($16) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, -16($16) # L1 + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, -8($16) # L1 + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 0($17) # L1 + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl $1, 8($17) # L1 + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # L0 lo + acc + stl $22, -48($16) # L0 + stl $23, -40($16) # L1 + mull $19, $3, $15 # U1 + addl $8, $21, $8 # U0 hi mul + carry + cmpult $4, $9, $20 # L0 lo add => carry + addl $4, $8, $22 # U0 hi add => answer + ble $18, $Lend # U1 bookkeeping + + # ____ MAIN UNROLLED LOOP ____ + .align 4 +$Loop: + bis $31, $31, $31 # U1 mt + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 0($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 8($16) # L1 + + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + ldl $2, 16($17) # L1 + + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + ldl $3, 24($17) # L1 + + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, -32($16) # L0 + stl $23, -24($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $1, $11 # U1 + bis $31, $31, $31 # L1 st slosh + addl $12, $21, $12 # U0 hi mul + carry + + cmpult $6, $13, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $18, -1($18) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, 16($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, 24($16) # L1 + + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 32($17) # L1 + + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl 
$1, 40($17) # L1 + + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # U0 lo + acc + stl $22, -16($16) # L0 + stl $23, -8($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $3, $15 # U1 + bis $31, $31, $31 # L1 st slosh + addl $8, $21, $8 # L0 hi mul + carry + + cmpult $4, $9, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $17, 64($17) # L1 bookkeeping + addl $4, $8, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 32($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 40($16) # L1 + + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + ldl $2, -16($17) # L1 + + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + ldl $3, -8($17) # L1 + + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, 0($16) # L0 + stl $23, 8($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $1, $11 # U1 + bis $31, $31, $31 # L1 st slosh + addl $12, $21, $12 # U0 hi mul + carry + + cmpult $6, $13, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $16, 64($16) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, -16($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, -8($16) # L1 + + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 0($17) # L1 + + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl $1, 8($17) # L1 + + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # L0 lo + acc + stl $22, -48($16) # L0 + stl $23, -40($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $3, $15 # U1 + bis $31, $31, $31 # L1 st slosh + addl $8, $21, $8 # U0 hi mul + carry + + cmpult $4, $9, $20 # L0 lo add => carry + addl $4, $8, $22 # U0 hi add => answer + bis $31, $31, $31 # L1 mt + bgt $18, $Loop # U1 bookkeeping + +# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ +$Lend: + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 0($16) # L1 + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 8($16) # L1 + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, -32($16) # L0 + stl $23, -24($16) # L1 + mull $19, $1, $11 # U1 + addl $12, $21, $12 # U0 hi mul + carry + cmpult $6, $13, $20 # L0 lo add => carry + addl $6, $12, $22 # U0 hi add => answer + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + addl $4, $9, $4 # U0 lo + acc + stl $22, -16($16) # L0 + stl $23, -8($16) # L1 + bis $31, $31, $31 # L0 st slosh + addl $8, $21, $8 # L0 hi mul + carry + cmpult $4, $9, $20 # L0 lo 
add => carry + addl $4, $8, $22 # U0 hi add => answer + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + stl $22, 0($16) # L0 + stl $23, 8($16) # L1 + addl $12, $21, $0 # U0 hi mul + carry + + ldl $9, 8($30) + ldl $10, 16($30) + ldl $11, 24($30) + ldl $12, 32($30) + ldl $13, 40($30) + ldl $14, 48($30) + ldl $15, 56($30) + ldi $30, 240($30) + ret $31, ($26), 1 + + .end __mpn_addmul_1 diff --git a/sysdeps/sw_64/sw6a/lshift.S b/sysdeps/sw_64/sw6a/lshift.S new file mode 100644 index 00000000..cc00593c --- /dev/null +++ b/sysdeps/sw_64/sw6a/lshift.S @@ -0,0 +1,172 @@ + # Sw_64 __mpn_lshift -- + + # Copyright (C) 1994-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 3.25 cycles/limb on the sw_64. 
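The left shift itself is as simple as the right shift sketched earlier; what the file below spends its lines on is the warm-up, main-loop and cool-down phases of the software pipeline. For orientation, here is a C sketch of the operation, assuming 64-bit limbs, n >= 1 and 0 < cnt < 64, with an illustrative name; like the assembly, it walks the vector from the top limb downwards.

  #include <stdint.h>

  /* Shift {up, n} left by cnt bits into {rp, n} and return the bits
     shifted out of the top limb.  */
  static uint64_t mpn_lshift_ref (uint64_t *rp, const uint64_t *up,
                                  long n, unsigned cnt)
  {
    uint64_t retval = up[n - 1] >> (64 - cnt);
    for (long i = n - 1; i > 0; i--)
      rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
    rp[0] = up[0] << cnt;
    return retval;
  }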
+ + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addl $18,$17,$17 # make r17 point at end of s1 + ldl $4,-8($17) # load first limb + subl $31,$19,$20 + s8addl $18,$16,$16 # make r16 point at end of RES + subl $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + srl $4,$20,$0 # compute function result + + beq $28,.L0 + subl $18,$28,$18 + + .align 3 +.Loop0: ldl $3,-16($17) + subl $16,8,$16 + sll $4,$19,$5 + subl $17,8,$17 + subl $28,1,$28 + srl $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stl $8,0($16) + bne $28,.Loop0 + +.L0: sll $4,$19,$24 + beq $18,.Lend + # warm up phase 1 + ldl $1,-16($17) + subl $18,4,$18 + ldl $2,-24($17) + ldl $3,-32($17) + ldl $4,-40($17) + beq $18,.Lend1 + # warm up phase 2 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + ldl $1,-48($17) + sll $2,$19,$22 + ldl $2,-56($17) + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + ldl $3,-64($17) + sll $4,$19,$24 + ldl $4,-72($17) + subl $18,4,$18 + beq $18,.Lend2 + .align 4 + # main loop +.Loop: stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + + srl $1,$20,$7 + subl $18,4,$18 + sll $1,$19,$21 + unop # ldl $31,-96($17) + + srl $2,$20,$8 + ldl $1,-80($17) + sll $2,$19,$22 + ldl $2,-88($17) + + stl $5,-24($16) + or $7,$24,$7 + stl $6,-32($16) + or $8,$21,$8 + + srl $3,$20,$5 + unop # ldl $31,-96($17) + sll $3,$19,$23 + subl $16,32,$16 + + srl $4,$20,$6 + ldl $3,-96($17) + sll $4,$19,$24 + ldl $4,-104($17) + + subl $17,32,$17 + bne $18,.Loop + # cool down phase 2/1 +.Lend2: stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + stl $5,-24($16) + or $7,$24,$7 + stl $6,-32($16) + or $8,$21,$8 + srl $3,$20,$5 + sll $3,$19,$23 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 2/2 + stl $7,-40($16) + or $5,$22,$5 + stl $8,-48($16) + or $6,$23,$6 + stl $5,-56($16) + stl $6,-64($16) + # cool down phase 2/3 + stl $24,-72($16) + ret $31,($26),1 + + # cool down phase 1/1 +.Lend1: srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 1/2 + stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + stl $5,-24($16) + stl $6,-32($16) + stl $24,-40($16) + ret $31,($26),1 + +.Lend: stl $24,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/sysdeps/sw_64/sw6a/rshift.S b/sysdeps/sw_64/sw6a/rshift.S new file mode 100644 index 00000000..416c3903 --- /dev/null +++ b/sysdeps/sw_64/sw6a/rshift.S @@ -0,0 +1,170 @@ + # Sw_64 __mpn_rshift -- + + # Copyright (C) 1994-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . 
+ + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 3.25 cycles/limb on the sw_64. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldl $4,0($17) # load first limb + subl $31,$19,$20 + subl $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + sll $4,$20,$0 # compute function result + + beq $28,.L0 + subl $18,$28,$18 + + .align 3 +.Loop0: ldl $3,8($17) + addl $16,8,$16 + srl $4,$19,$5 + addl $17,8,$17 + subl $28,1,$28 + sll $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stl $8,-8($16) + bne $28,.Loop0 + +.L0: srl $4,$19,$24 + beq $18,.Lend + # warm up phase 1 + ldl $1,8($17) + subl $18,4,$18 + ldl $2,16($17) + ldl $3,24($17) + ldl $4,32($17) + beq $18,.Lend1 + # warm up phase 2 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + ldl $1,40($17) + srl $2,$19,$22 + ldl $2,48($17) + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + ldl $3,56($17) + srl $4,$19,$24 + ldl $4,64($17) + subl $18,4,$18 + beq $18,.Lend2 + .align 4 + # main loop +.Loop: stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + + sll $1,$20,$7 + subl $18,4,$18 + srl $1,$19,$21 + unop # ldl $31,-96($17) + + sll $2,$20,$8 + ldl $1,72($17) + srl $2,$19,$22 + ldl $2,80($17) + + stl $5,16($16) + or $7,$24,$7 + stl $6,24($16) + or $8,$21,$8 + + sll $3,$20,$5 + unop # ldl $31,-96($17) + srl $3,$19,$23 + addl $16,32,$16 + + sll $4,$20,$6 + ldl $3,88($17) + srl $4,$19,$24 + ldl $4,96($17) + + addl $17,32,$17 + bne $18,.Loop + # cool down phase 2/1 +.Lend2: stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + stl $5,16($16) + or $7,$24,$7 + stl $6,24($16) + or $8,$21,$8 + sll $3,$20,$5 + srl $3,$19,$23 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 2/2 + stl $7,32($16) + or $5,$22,$5 + stl $8,40($16) + or $6,$23,$6 + stl $5,48($16) + stl $6,56($16) + # cool down phase 2/3 + stl $24,64($16) + ret $31,($26),1 + + # cool down phase 1/1 +.Lend1: sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 1/2 + stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + stl $5,16($16) + stl $6,24($16) + stl $24,32($16) + ret $31,($26),1 + +.Lend: stl $24,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/sysdeps/sw_64/sw6a/sub_n.S b/sysdeps/sw_64/sw6a/sub_n.S new file mode 100644 index 00000000..95c257f7 --- /dev/null +++ b/sysdeps/sw_64/sw6a/sub_n.S @@ -0,0 +1,147 @@ + # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . 
+ + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subl $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldl $0,0($18) + ldl $1,8($18) + ldl $4,0($17) + ldl $5,8($17) + addl $17,32,$17 # update s1_ptr + ldl $2,16($18) + subl $4,$0,$20 # 1st main sub + ldl $3,24($18) + subl $19,4,$19 # decr loop cnt + ldl $6,-16($17) + cmpult $4,$20,$25 # compute cy from last sub + ldl $7,-8($17) + addl $1,$25,$28 # cy add + addl $18,32,$18 # update s2_ptr + subl $5,$28,$21 # 2nd main sub + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $5,$21,$25 # compute cy from last add + ldl $0,0($18) + or $8,$25,$25 # combine cy from the two fadds + ldl $1,8($18) + addl $2,$25,$28 # cy add + ldl $4,0($17) + subl $6,$28,$22 # 3rd main sub + ldl $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + subl $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + addl $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + addl $0,$25,$28 # cy add + ldl $2,16($18) + subl $4,$28,$20 # 1st main sub + ldl $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldl $6,-16($17) + cmpult $4,$20,$25 # compute cy from last add + ldl $7,-8($17) + or $8,$25,$25 # combine cy from the two fadds + subl $19,4,$19 # decr loop cnt + stl $22,-16($16) + addl $1,$25,$28 # cy add + stl $23,-8($16) + subl $5,$28,$21 # 2nd main sub + addl $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $5,$21,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $2,$25,$28 # cy add + subl $6,$28,$22 # 3rd main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + subl $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + stl $22,-16($16) + stl $23,-8($16) +.Lend2: addl $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldl $0,0($18) + ldl $4,0($17) + subl $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addl $0,$25,$28 # cy add + ldl $0,8($18) + subl $4,$28,$20 # main sub + ldl $1,8($17) + addl $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addl $17,8,$17 + stl $20,0($16) + cmpult $4,$20,$25 # compute cy from last add + subl $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two fadds + addl $16,8,$16 + or $1,$31,$4 + bne $19,.Loop0 +.Lend0: addl $0,$25,$28 # cy add + subl $4,$28,$20 # main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $4,$20,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + 
+.Lret: or $25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_sub_n diff --git a/sysdeps/sw_64/sw6b/add_n.S b/sysdeps/sw_64/sw6b/add_n.S new file mode 100644 index 00000000..86e9f9ae --- /dev/null +++ b/sysdeps/sw_64/sw6b/add_n.S @@ -0,0 +1,146 @@ + # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subl $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldl $0,0($18) + ldl $1,8($18) + ldl $4,0($17) + ldl $5,8($17) + addl $17,32,$17 # update s1_ptr + ldl $2,16($18) + addl $0,$4,$20 # 1st main add + ldl $3,24($18) + subl $19,4,$19 # decr loop cnt + ldl $6,-16($17) + cmpult $20,$0,$25 # compute cy from last add + ldl $7,-8($17) + addl $1,$25,$28 # cy add + addl $18,32,$18 # update s2_ptr + addl $5,$28,$21 # 2nd main add + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $21,$28,$25 # compute cy from last add + ldl $0,0($18) + or $8,$25,$25 # combine cy from the two fadds + ldl $1,8($18) + addl $2,$25,$28 # cy add + ldl $4,0($17) + addl $28,$6,$22 # 3rd main add + ldl $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + addl $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + addl $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + addl $0,$25,$28 # cy add + ldl $2,16($18) + addl $4,$28,$20 # 1st main add + ldl $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldl $6,-16($17) + cmpult $20,$28,$25 # compute cy from last add + ldl $7,-8($17) + or $8,$25,$25 # combine cy from the two fadds + subl $19,4,$19 # decr loop cnt + stl $22,-16($16) + addl $1,$25,$28 # cy add + stl $23,-8($16) + addl $5,$28,$21 # 2nd main add + addl $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $21,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $2,$25,$28 # cy add + addl $28,$6,$22 # 3rd main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 
# combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + addl $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + stl $22,-16($16) + stl $23,-8($16) +.Lend2: addl $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldl $0,0($18) + ldl $4,0($17) + subl $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addl $0,$25,$28 # cy add + ldl $0,8($18) + addl $4,$28,$20 # main add + ldl $4,8($17) + addl $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addl $17,8,$17 + stl $20,0($16) + cmpult $20,$28,$25 # compute cy from last add + subl $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two fadds + addl $16,8,$16 + bne $19,.Loop0 +.Lend0: addl $0,$25,$28 # cy add + addl $4,$28,$20 # main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $20,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + +.Lret: or $25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_add_n diff --git a/sysdeps/sw_64/sw6b/addmul_1.S b/sysdeps/sw_64/sw6b/addmul_1.S new file mode 100644 index 00000000..a288f040 --- /dev/null +++ b/sysdeps/sw_64/sw6b/addmul_1.S @@ -0,0 +1,475 @@ + # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add + # the result to a second limb vector. + # + # Copyright (C) 2000-2023 Free Software Foundation, Inc. + # + # This file is part of the GNU MP Library. + # + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published + # by the Free Software Foundation; either version 2.1 of the License, or (at + # your option) any later version. + # + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + # + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # size $18 + # s2_limb $19 + # + # + # This code was written in close cooperation with pipeline expert + # . Any errors are tege's fault, though. + # + # Register usages for unrolled loop: + # 0-3 mul's + # 4-7 acc's + # 8-15 mul results + # 20,21 carry's + # 22,23 save for stores + # + # Sustains 8 mul-fadds in 29 cycles in the unrolled inner loop. + # + # The stores can issue a cycle late so we have paired no-op's to 'catch' + # them, so that further disturbance to the schedule is damped. + # + # We couldn't pair the loads, because the entangled schedule of the + # carry's has to happen on one side {0} of the machine. Note, the total + # use of U0, and the total use of L0 (after attending to the stores). + # which is part of the reason why.... + # + # This is a great schedule for the d_cache, a poor schedule for the + # b_cache. The lockup on U0 means that any stall can't be recovered + # from. Consider a ldl in L1. say that load gets stalled because it + # collides with a fill from the b_Cache. On the next cycle, this load + # gets priority. If first looks at L0, and goes there. The instruction + # we intended for L0 gets to look at L1, which is NOT where we want + # it. 
It either stalls 1, because it can't go in L0, or goes there, and + # causes a further instruction to stall. + # + # So for b_cache, we're likely going to want to put one or more cycles + # back into the code! And, of course, put in prefetches. For the + # accumulator, flds, intent to modify. For the fmuldiplier, you might + # want ldl, evict next, if you're not wanting to use it again soon. Use + # 256 ahead of present pointer value. At a place where we have an mt + # followed by a bookkeeping, put the bookkeeping in upper, and the + # prefetch into lower. + # + # Note, the usage of physical registers per cycle is smoothed off, as + # much as possible. + # + # Note, the ldl's and stl's are at the end of the quadpacks. note, we'd + # like not to have a ldl or stl to preceded a conditional branch in a + # quadpack. The conditional branch moves the retire pointer one cycle + # later. + # + # Optimization notes: + # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27? + # Reserved regs: $29 $30 $31 + # Free caller-saves regs in unrolled code: $24 $25 $28 + # We should swap some of the callee-saves regs for some of the free + # caller-saves regs, saving some overhead cycles. + # Most importantly, we should write fast code for the 0-7 case. + # The code we use there are for the 21164, and runs at 7 cycles/limb + # on the 21264. Should not be hard, if we write specialized code for + # 1-7 limbs (the one for 0 limbs should be straightforward). We then just + # need a jump table indexed by the low 3 bits of the count argument. + + .set noreorder + .set noat + .text + + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 +__mpn_addmul_1: + .frame $30,0,$26,0 + .prologue 0 + + cmpult $18, 8, $1 + beq $1, $Large + + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $18, 1, $18 # size-- + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + umulh $2, $19, $0 # $0 = prod_high + beq $18, $Lend0b # jump if size was == 1 + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $18, 1, $18 # size-- + addl $5, $3, $3 + cmpult $3, $5, $4 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + beq $18, $Lend0a # jump if size was == 2 + + .align 3 +$Loop0: mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + subl $18, 1, $18 # size-- + umulh $2, $19, $4 # $4 = cy_limb + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + bne $18, $Loop0 +$Lend0a: + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + umulh $2, $19, $4 # $4 = cy_limb + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $5, $0, $0 # combine carries + addl $4, $0, $0 # cy_limb = prod_high + cy + ret $31, ($26), 1 +$Lend0b: + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $0, $5, $0 + ret $31, ($26), 1 + +$Large: + ldi $30, -240($30) + stl $9, 8($30) + stl $10, 16($30) + stl $11, 24($30) + stl $12, 32($30) + stl $13, 40($30) + stl $14, 48($30) + stl $15, 56($30) + + and $18, 7, $20 # count for the first loop, 0-7 + srl $18, 3, $18 # count for unrolled loop + bis $31, $31, $0 + beq $20, $Lunroll + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 
# s1_ptr++ + subl $20, 1, $20 # size-- + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + umulh $2, $19, $0 # $0 = prod_high + beq $20, $Lend1b # jump if size was == 1 + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $20, 1, $20 # size-- + addl $5, $3, $3 + cmpult $3, $5, $4 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + beq $20, $Lend1a # jump if size was == 2 + + .align 3 +$Loop1: mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + subl $20, 1, $20 # size-- + umulh $2, $19, $4 # $4 = cy_limb + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + bne $20, $Loop1 + +$Lend1a: + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + umulh $2, $19, $4 # $4 = cy_limb + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + addl $4, $0, $0 # cy_limb = prod_high + cy + br $31, $Lunroll +$Lend1b: + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $0, $5, $0 + +$Lunroll: + ldi $17, -16($17) # L1 bookkeeping + ldi $16, -16($16) # L1 bookkeeping + bis $0, $31, $12 + + # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ + + ldl $2, 16($17) # L1 + ldl $3, 24($17) # L1 + ldi $18, -1($18) # L1 bookkeeping + ldl $6, 16($16) # L1 + ldl $7, 24($16) # L1 + ldl $0, 32($17) # L1 + mull $19, $2, $13 # U1 + ldl $1, 40($17) # L1 + umulh $19, $2, $14 # U1 + mull $19, $3, $15 # U1 + ldi $17, 64($17) # L1 bookkeeping + ldl $4, 32($16) # L1 + ldl $5, 40($16) # L1 + umulh $19, $3, $8 # U1 + ldl $2, -16($17) # L1 + mull $19, $0, $9 # U1 + ldl $3, -8($17) # L1 + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + mull $19, $1, $11 # U1 + cmpult $6, $13, $20 # L0 lo add => carry + ldi $16, 64($16) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, -16($16) # L1 + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, -8($16) # L1 + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 0($17) # L1 + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl $1, 8($17) # L1 + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # L0 lo + acc + stl $22, -48($16) # L0 + stl $23, -40($16) # L1 + mull $19, $3, $15 # U1 + addl $8, $21, $8 # U0 hi mul + carry + cmpult $4, $9, $20 # L0 lo add => carry + addl $4, $8, $22 # U0 hi add => answer + ble $18, $Lend # U1 bookkeeping + + # ____ MAIN UNROLLED LOOP ____ + .align 4 +$Loop: + bis $31, $31, $31 # U1 mt + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 0($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 8($16) # L1 + + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + ldl $2, 16($17) # L1 + + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + 
addl $12, $20, $12 # U0 hi mul + carry + ldl $3, 24($17) # L1 + + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, -32($16) # L0 + stl $23, -24($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $1, $11 # U1 + bis $31, $31, $31 # L1 st slosh + addl $12, $21, $12 # U0 hi mul + carry + + cmpult $6, $13, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $18, -1($18) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, 16($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, 24($16) # L1 + + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 32($17) # L1 + + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl $1, 40($17) # L1 + + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # U0 lo + acc + stl $22, -16($16) # L0 + stl $23, -8($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $3, $15 # U1 + bis $31, $31, $31 # L1 st slosh + addl $8, $21, $8 # L0 hi mul + carry + + cmpult $4, $9, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $17, 64($17) # L1 bookkeeping + addl $4, $8, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 32($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 40($16) # L1 + + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + ldl $2, -16($17) # L1 + + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + ldl $3, -8($17) # L1 + + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, 0($16) # L0 + stl $23, 8($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $1, $11 # U1 + bis $31, $31, $31 # L1 st slosh + addl $12, $21, $12 # U0 hi mul + carry + + cmpult $6, $13, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $16, 64($16) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, -16($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, -8($16) # L1 + + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 0($17) # L1 + + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl $1, 8($17) # L1 + + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # L0 lo + acc + stl $22, -48($16) # L0 + stl $23, -40($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $3, $15 # U1 + bis $31, $31, $31 # L1 st slosh + addl $8, $21, $8 # U0 hi mul + carry + + cmpult $4, $9, $20 # L0 lo add => carry + addl $4, $8, $22 # U0 hi add => answer + bis $31, $31, $31 # L1 mt + bgt $18, $Loop # U1 bookkeeping + +# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ +$Lend: + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 0($16) # L1 + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 8($16) # L1 + umulh $19, $3, $8 # U1 + cmpult $23, 
$11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, -32($16) # L0 + stl $23, -24($16) # L1 + mull $19, $1, $11 # U1 + addl $12, $21, $12 # U0 hi mul + carry + cmpult $6, $13, $20 # L0 lo add => carry + addl $6, $12, $22 # U0 hi add => answer + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + addl $4, $9, $4 # U0 lo + acc + stl $22, -16($16) # L0 + stl $23, -8($16) # L1 + bis $31, $31, $31 # L0 st slosh + addl $8, $21, $8 # L0 hi mul + carry + cmpult $4, $9, $20 # L0 lo add => carry + addl $4, $8, $22 # U0 hi add => answer + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + stl $22, 0($16) # L0 + stl $23, 8($16) # L1 + addl $12, $21, $0 # U0 hi mul + carry + + ldl $9, 8($30) + ldl $10, 16($30) + ldl $11, 24($30) + ldl $12, 32($30) + ldl $13, 40($30) + ldl $14, 48($30) + ldl $15, 56($30) + ldi $30, 240($30) + ret $31, ($26), 1 + + .end __mpn_addmul_1 diff --git a/sysdeps/sw_64/sw6b/lshift.S b/sysdeps/sw_64/sw6b/lshift.S new file mode 100644 index 00000000..cc00593c --- /dev/null +++ b/sysdeps/sw_64/sw6b/lshift.S @@ -0,0 +1,172 @@ + # Sw_64 __mpn_lshift -- + + # Copyright (C) 1994-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 3.25 cycles/limb on the sw_64. 
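 + # For reference, the operation this routine implements is, in rough C
 + # terms, the sketch below.  It is illustrative only and not part of the
 + # patch; it assumes 64-bit limbs, a shift count cnt in 1..63, and uses
 + # the hypothetical name mpn_lshift_ref for clarity:
 + #
 + #   mp_limb_t
 + #   mpn_lshift_ref (mp_limb_t *rp, const mp_limb_t *up, size_t n, unsigned cnt)
 + #   {
 + #     mp_limb_t high = up[n - 1];
 + #     mp_limb_t ret = high >> (64 - cnt);   /* bits shifted out the top */
 + #     for (size_t i = n - 1; i > 0; i--)
 + #       {
 + #         mp_limb_t low = up[i - 1];
 + #         rp[i] = (high << cnt) | (low >> (64 - cnt));
 + #         high = low;
 + #       }
 + #     rp[0] = high << cnt;
 + #     return ret;                           /* function result */
 + #   }
 + #
 + # The assembly below walks the limbs from the top down (note how r16 and
 + # r17 are first advanced to the end of the vectors with s8addl), which is
 + # the ordering that lets an lshift-style routine tolerate a destination
 + # that overlaps the source from above.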
+ + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addl $18,$17,$17 # make r17 point at end of s1 + ldl $4,-8($17) # load first limb + subl $31,$19,$20 + s8addl $18,$16,$16 # make r16 point at end of RES + subl $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + srl $4,$20,$0 # compute function result + + beq $28,.L0 + subl $18,$28,$18 + + .align 3 +.Loop0: ldl $3,-16($17) + subl $16,8,$16 + sll $4,$19,$5 + subl $17,8,$17 + subl $28,1,$28 + srl $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stl $8,0($16) + bne $28,.Loop0 + +.L0: sll $4,$19,$24 + beq $18,.Lend + # warm up phase 1 + ldl $1,-16($17) + subl $18,4,$18 + ldl $2,-24($17) + ldl $3,-32($17) + ldl $4,-40($17) + beq $18,.Lend1 + # warm up phase 2 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + ldl $1,-48($17) + sll $2,$19,$22 + ldl $2,-56($17) + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + ldl $3,-64($17) + sll $4,$19,$24 + ldl $4,-72($17) + subl $18,4,$18 + beq $18,.Lend2 + .align 4 + # main loop +.Loop: stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + + srl $1,$20,$7 + subl $18,4,$18 + sll $1,$19,$21 + unop # ldl $31,-96($17) + + srl $2,$20,$8 + ldl $1,-80($17) + sll $2,$19,$22 + ldl $2,-88($17) + + stl $5,-24($16) + or $7,$24,$7 + stl $6,-32($16) + or $8,$21,$8 + + srl $3,$20,$5 + unop # ldl $31,-96($17) + sll $3,$19,$23 + subl $16,32,$16 + + srl $4,$20,$6 + ldl $3,-96($17) + sll $4,$19,$24 + ldl $4,-104($17) + + subl $17,32,$17 + bne $18,.Loop + # cool down phase 2/1 +.Lend2: stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + stl $5,-24($16) + or $7,$24,$7 + stl $6,-32($16) + or $8,$21,$8 + srl $3,$20,$5 + sll $3,$19,$23 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 2/2 + stl $7,-40($16) + or $5,$22,$5 + stl $8,-48($16) + or $6,$23,$6 + stl $5,-56($16) + stl $6,-64($16) + # cool down phase 2/3 + stl $24,-72($16) + ret $31,($26),1 + + # cool down phase 1/1 +.Lend1: srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 1/2 + stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + stl $5,-24($16) + stl $6,-32($16) + stl $24,-40($16) + ret $31,($26),1 + +.Lend: stl $24,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/sysdeps/sw_64/sw6b/memcpy.S b/sysdeps/sw_64/sw6b/memcpy.S new file mode 100644 index 00000000..938ebdfc --- /dev/null +++ b/sysdeps/sw_64/sw6b/memcpy.S @@ -0,0 +1,416 @@ +/* Copyright (C) 2000-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + sw6 optimized by Rick Gorton . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . 
*/ + +/* + * Much of the information about 21264 scheduling/coding comes from: + * Compiler Writer's Guide for the Sw_64 21264 + * abbreviated as 'CWG' in other comments here + * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html + * Scheduling notation: + * E - either cluster + * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 + * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 + * + * Temp usage notes: + * $0 - destination address + * $1,$2, - scratch + */ + +#include + + .arch ev6 + .set noreorder + .set noat + + .type $jmppointh,@object +$jumppointh: + .gprel32 $both_0mod8 + .gprel32 J$H01 + .gprel32 J$H02 + .gprel32 J$H03 + .gprel32 J$H04 + .gprel32 J$H05 + .gprel32 J$H06 + .gprel32 J$H07 + +ENTRY(memcpy) + .prologue 1 + ldgp $29, 0($27) + mov $16, $0 # E : copy dest to return + ble $18, $nomoredata # U : done with the copy? + cmplt $18, 8, $1 + bne $1, $less_8 + xor $16, $17, $1 # E : are source and dest alignments the same? + and $1, 7, $1 # E : are they the same mod 8? + + bne $1, $misaligned # U : Nope - gotta do this the slow way + /* source and dest are same mod 8 address */ + and $16, 7, $1 # E : Are both 0mod8? + beq $1, $both_0mod8 # U : Yes + nop # E : + + /* + * source and dest are same misalignment. move a byte at a time + * until a 0mod8 alignment for both is reached. + * At least one byte more to move + */ + + ldi $2, 8 + subl $2, $1, $1 + +$head_align: + addl $16, $1, $16 + addl $17, $1, $17 + subl $18, $1, $18 + ldih $2, $jumppointh($29) !gprelhigh + s4addl $1, $2, $2 + ldw $2, $jumppointh($2) !gprellow + addl $2, $29, $2 + jmp ($2) + +$both_0mod8: + cmple $18, 127, $1 # E : Can we unroll the loop? + bne $1, $no_unroll # U : + and $16, 63, $1 # E : get mod64 alignment + beq $1, $do_unroll # U : no single quads to fiddle + +$single_head_quad: + ldl $1, 0($17) # L : get 8 bytes + subl $18, 8, $18 # E : count -= 8 + addl $17, 8, $17 # E : src += 8 + nop # E : + + stl $1, 0($16) # L : store + addl $16, 8, $16 # E : dest += 8 + and $16, 63, $1 # E : get mod64 alignment + bne $1, $single_head_quad # U : still not fully aligned + +$do_unroll: + ldih $1, 8($31) # big than 512K + cmple $18, $1, $1 + beq $1, $unroll_body_512 + nop + nop + cmple $18, 63, $1 # E : Can we go through the unrolled loop? + bne $1, $tail_quads # U : Nope + nop # E : + +$unroll_body: + ldl $6, 0($17) # L0 : bytes 0..7 + nop # E : + nop # E : + + ldl $4, 8($17) # L : bytes 8..15 + ldl $5, 16($17) # L : bytes 16..23 + nop # E : + nop # E : + + ldl $3, 24($17) # L : bytes 24..31 + addl $16, 64, $1 # E : fallback value for wh64 + nop # E : + nop # E : + + addl $17, 32, $17 # E : src += 32 bytes + stl $6, 0($16) # L : bytes 0..7 + nop # E : + nop # E : + + stl $4, 8($16) # L : bytes 8..15 + stl $5, 16($16) # L : bytes 16..23 + subl $18, 192, $2 # E : At least two more trips to go? + nop # E : + + stl $3, 24($16) # L : bytes 24..31 + addl $16, 32, $16 # E : dest += 32 bytes + nop # E : + nop # E : + + ldl $6, 0($17) # L : bytes 0..7 + ldl $4, 8($17) # L : bytes 8..15 + # fallback wh64 address if < 2 more trips + nop # E : + nop # E : + + ldl $5, 16($17) # L : bytes 16..23 + ldl $3, 24($17) # L : bytes 24..31 + addl $16, 32, $16 # E : dest += 32 + subl $18, 64, $18 # E : count -= 64 + + addl $17, 32, $17 # E : src += 32 + stl $6, -32($16) # L : bytes 0..7 + stl $4, -24($16) # L : bytes 8..15 + cmple $18, 63, $1 # E : At least one more trip? 
+ + stl $5, -16($16) # L : bytes 16..23 + stl $3, -8($16) # L : bytes 24..31 + nop # E : + beq $1, $unroll_body + nop + nop + nop + br $tail_quads + +$unroll_body_512: + fillcs 128*4($17) + e_fillcs 128*20($17) + + fillcs 128*3($16) #add by ZJ20220620 stl_nc->stl + e_fillcs 128*7($16) + + ldl $6, 0($17) # L0 : bytes 0..7 + nop # E : + nop # E : + + ldl $4, 8($17) # L : bytes 8..15 + ldl $5, 16($17) # L : bytes 16..23 + nop # E : + nop # E : + + ldl $3, 24($17) # L : bytes 24..31 + addl $16, 64, $1 # E : fallback value for wh64 + nop # E : + nop # E : + + addl $17, 32, $17 # E : src += 32 bytes + stl $6, 0($16) # L : bytes 0..7 + nop # E : + nop # E : + + stl $4, 8($16) # L : bytes 8..15 + stl $5, 16($16) # L : bytes 16..23 + subl $18, 192, $2 # E : At least two more trips to go? + nop # E : + + stl $3, 24($16) # L : bytes 24..31 + addl $16, 32, $16 # E : dest += 32 bytes + nop # E : + nop # E : + + ldl $6, 0($17) # L : bytes 0..7 + ldl $4, 8($17) # L : bytes 8..15 + # fallback wh64 address if < 2 more trips + nop # E : + nop # E : + + ldl $5, 16($17) # L : bytes 16..23 + ldl $3, 24($17) # L : bytes 24..31 + addl $16, 32, $16 # E : dest += 32 + subl $18, 64, $18 # E : count -= 64 + + addl $17, 32, $17 # E : src += 32 + stl $6, -32($16) # L : bytes 0..7 + stl $4, -24($16) # L : bytes 8..15 + cmple $18, 63, $1 # E : At least one more trip? + + stl $5, -16($16) # L : bytes 16..23 + stl $3, -8($16) # L : bytes 24..31 + nop # E : + beq $1, $unroll_body_512 + +$tail_quads: +$no_unroll: + .align 4 + subl $18, 8, $18 # E : At least a quad left? + blt $18, $less_than_8 # U : Nope + nop # E : + nop # E : + +$move_a_quad: + ldl $1, 0($17) # L : fetch 8 + subl $18, 8, $18 # E : count -= 8 + addl $17, 8, $17 # E : src += 8 + nop # E : + + stl $1, 0($16) # L : store 8 + addl $16, 8, $16 # E : dest += 8 + bge $18, $move_a_quad # U : + nop # E : + +$less_than_8: + .align 4 + addl $18, 8, $18 # E : add back for trailing bytes + ble $18, $nomoredata # U : All-done + nop # E : + nop # E : + + /* Trailing bytes */ +$tail_bytes: + subl $18, 1, $18 # E : count-- + ldbu $1, 0($17) # L : fetch a byte + addl $17, 1, $17 # E : src++ + nop # E : + + stb $1, 0($16) # L : store a byte + addl $16, 1, $16 # E : dest++ + bgt $18, $tail_bytes # U : more to be done? + nop # E : + + /* branching to exit takes 3 extra cycles, so replicate exit here */ + ret $31, ($26), 1 # L0 : + nop # E : + nop # E : + nop # E : + +$misaligned: + mov $0, $4 # E : dest temp + and $0, 7, $1 # E : dest alignment mod8 + beq $1, $dest_0mod8 # U : life doesnt totally suck + nop + +$aligndest: + ble $18, $nomoredata # U : + ldbu $1, 0($17) # L : fetch a byte + subl $18, 1, $18 # E : count-- + addl $17, 1, $17 # E : src++ + + stb $1, 0($4) # L : store it + addl $4, 1, $4 # E : dest++ + and $4, 7, $1 # E : dest 0mod8 yet? + bne $1, $aligndest # U : go until we are aligned. + + /* Source has unknown alignment, but dest is known to be 0mod8 */ +$dest_0mod8: + subl $18, 8, $18 # E : At least a quad left? 
+ blt $18, $misalign_tail # U : Nope + ldl_u $3, 0($17) # L : seed (rotating load) of 8 bytes + ldih $1, 8($31) + subl $1, 8, $1 + cmple $18, $1, $1 + beq $1, $mis_quad_big # big than 512K + +$mis_quad: + ldl_u $16, 8($17) # L : Fetch next 8 + ext3b $3, $17, $3 # U : masking + ext7b $16, $17, $1 # U : masking + bis $3, $1, $1 # E : merged bytes to store + + subl $18, 8, $18 # E : count -= 8 + addl $17, 8, $17 # E : src += 8 + stl $1, 0($4) # L : store 8 (aligned) + mov $16, $3 # E : "rotate" source data + + addl $4, 8, $4 # E : dest += 8 + bge $18, $mis_quad # U : More quads to move + nop + nop + nop + br $misalign_tail + +$mis_quad_big: + fillcs 128*4($17) + e_fillcs 128*20($17) + ldl_u $16, 8($17) # L : Fetch next 8 + ext3b $3, $17, $3 # U : masking + ext7b $16, $17, $1 # U : masking + bis $3, $1, $1 # E : merged bytes to store + + fillcs 128*9($17) #add by ZJ20220620 stl_nc->stl + e_fillcs 128*15($17) + + subl $18, 8, $18 # E : count -= 8 + addl $17, 8, $17 # E : src += 8 + stl $1, 0($4) # L : store 8 (aligned) + mov $16, $3 # E : "rotate" source data + + addl $4, 8, $4 # E : dest += 8 + bge $18, $mis_quad_big # U : More quads to move + nop + nop + +$misalign_tail: + addl $18, 8, $18 # E : account for tail stuff + ble $18, $nomoredata # U : + nop + nop + +$misalign_byte: + ldbu $1, 0($17) # L : fetch 1 + subl $18, 1, $18 # E : count-- + addl $17, 1, $17 # E : src++ + nop # E : + + stb $1, 0($4) # L : store + addl $4, 1, $4 # E : dest++ + bgt $18, $misalign_byte # U : more to go? + nop + br $nomoredata + +$less_8: + ldbu $1, 0($17) # L : fetch 1 + subl $18, 1, $18 # E : count-- + addl $17, 1, $17 # E : src++ + nop # E : + + stb $1, 0($16) # L : store + addl $16, 1, $16 # E : dest++ + bgt $18, $less_8 # U : more to go? + nop + +$nomoredata: + ret $31, ($26), 1 # L0 : + nop # E : + nop # E : + nop # E : + +J$H01: + ldbu $1,-1($17) + stb $1,-1($16) + br $both_0mod8 + +J$H02: + ldh $1,-2($17) + sth $1,-2($16) + br $both_0mod8 + +J$H03: + ldh $1,-2($17) + ldbu $2,-3($17) + sth $1,-2($16) + stb $2,-3($16) + br $both_0mod8 + +J$H04: + ldw $1,-4($17) + stw $1,-4($16) + br $both_0mod8 + +J$H05: + ldw $1,-4($17) + ldbu $2,-5($17) + stw $1,-4($16) + stb $2,-5($16) + br $both_0mod8 + +J$H06: + ldw $1,-4($17) + ldh $2,-6($17) + stw $1,-4($16) + sth $2,-6($16) + br $both_0mod8 + +J$H07: + ldw $1,-4($17) + ldh $2,-6($17) + ldbu $3,-7($17) + stw $1,-4($16) + sth $2,-6($16) + stb $3,-7($16) + br $both_0mod8 + +END(memcpy) +libc_hidden_builtin_def (memcpy) diff --git a/sysdeps/sw_64/sw6b/memset.S b/sysdeps/sw_64/sw6b/memset.S new file mode 100644 index 00000000..0085ac70 --- /dev/null +++ b/sysdeps/sw_64/sw6b/memset.S @@ -0,0 +1,312 @@ +/* Copyright (C) 2000-2023 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + SW6 optimized by Rick Gorton . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . 
*/ + + +#include + + .arch sw6b + .set noat + .set noreorder + +ENTRY(memset) +#ifdef PROF + ldgp gp, 0(pv) + ldi AT, _mcount + call AT, (AT), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + /* + * Serious stalling happens. The only way to mitigate this is to + * undertake a major re-write to interleave the constant materialization + * with other parts of the fall-through code. This is important, even + * though it makes maintenance tougher. + * Do this later. + */ + and $17, 255, $1 # E : 00000000000000ch + ins0b $17, 1, $2 # U : 000000000000ch00 + mov $16, $0 # E : return value + mov $17, $8 # E : Save the ch + ble $18, $end # U : zero length requested? + + addl $18, $16, $6 # E : max address to write to + or $1, $2, $17 # E : 000000000000chch + ins0b $1, 2, $3 # U : 0000000000ch0000 + ins0b $1, 3, $4 # U : 00000000ch000000 + + or $3, $4, $3 # E : 00000000chch0000 + ins1b $17, 4, $5 # U : 0000chch00000000 + xor $16, $6, $1 # E : will complete write be within one quadword? + ins1b $17, 6, $2 # U : chch000000000000 + + or $17, $3, $17 # E : 00000000chchchch + or $2, $5, $2 # E : chchchch00000000 + bic $1, 7, $1 # E : fit within a single quadword? + and $16, 7, $3 # E : Target addr misalignment + + or $17, $2, $17 # E : chchchchchchchch + beq $1, $within_quad # U : + nop # E : + beq $3, $aligned # U : target is 0mod8 + + /* + * Target address is misaligned, and won't fit within a quadword. + */ + +#ifdef pixman_error + /* if the addr is unaligned in multi-thread, this will cause thread + unsafty,so use stb to store the trailing bytes. */ + ldl_u $4, 0($16) # L : Fetch first partial + mov $16, $5 # E : Save the address + ins3b $17, $16, $2 # U : Insert new bytes + subl $3, 8, $3 # E : Invert (for addressing uses) + + addl $18, $3, $18 # E : $18 is new count ($3 is negative) + mask3b $4, $16, $4 # U : clear relevant parts of the quad + subl $16, $3, $16 # E : $16 is new aligned destination + or $2, $4, $1 # E : Final bytes + + nop + stl_u $1,0($5) # L : Store result + nop + nop +#else +$misaligned: + stb $8, 0($16) + subl $18, 1, $18 + beq $18, $end + addl $16, 1, $16 + and $16, 7, $3 # E : Target addr misalignment + bne $3, $misaligned +#endif + + .align 4 +$aligned: + /* + * We are now guaranteed to be quad aligned, with at least + * one partial quad to write. + */ + + sra $18, 3, $3 # U : Number of remaining quads to write + and $18, 7, $18 # E : Number of trailing bytes to write + mov $16, $5 # E : Save dest address + beq $3, $no_quad # U : tail stuff only + + /* + * It's worth the effort to unroll this and use wh64 if possible. + * At this point, entry values are: + * $16 Current destination address + * $5 A copy of $16 + * $6 The max quadword address to write to + * $18 Number trailer bytes + * $3 Number quads to write + */ +# and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) + and $16, 0x1f, $2 # E : Forward work (only useful for unrolled loop) + subl $3, 16, $4 # E : Only try to unroll if > 128 bytes + subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) + blt $4, $loop # U : + + /* + * We know we've got at least 16 quads, minimum of one trip + * through unrolled loop. Do a quad at a time to get us 0mod64 + * aligned. 
+ */ + + nop # E : + nop # E : + nop # E : +# beq $1, $bigalign # U : + beq $2, $bigalign # U : +$alignmod32: + stl $17, 0($5) # L : + subl $3, 1, $3 # E : For consistency later + addl $1, 8, $1 # E : Increment towards zero for alignment +# addl $5, 8, $4 # E : Initial wh64 address (filler instruction) + + nop + nop + addl $5, 8, $5 # E : Inc address + blt $1, $alignmod32 # U : + + +$bigalign: + ldih $1, 8($31) # big than 512KB + cmple $18, $1, $1 + beq $1, $do_wh64_512 + + /* + * $3 - number quads left to go + * $5 - target address (aligned 0mod64) + * $17 - mask of stuff to store + * Scratch registers available: $7, $2, $4, $1 + * We know that we'll be taking a minimum of one trip through. + * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle + * Assumes the wh64 needs to be for 2 trips through the loop in the + * future.The wh64 is issued on for the starting destination address for + * trip +2 through the loop, and if there are less than two trips left, + * the target address will be for the current trip. */ + +$do_wh64: +# wh64 ($4) # L1 : memory subsystem write hint + subl $3, 24, $2 # E : For determining future wh64 addresses + stl $17, 0($5) # L : + nop # E : + +# addl $5, 128, $4 # E : speculative target of next wh64 + stl $17, 8($5) # L : + stl $17, 16($5) # L : + addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) + + stl $17, 24($5) # L : + stl $17, 32($5) # L : +# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle + nop + + stl $17, 40($5) # L : + stl $17, 48($5) # L : + subl $3, 16, $2 # E : Repeat the loop at least once more? + nop + + stl $17, 56($5) # L : + addl $5, 64, $5 # E : + subl $3, 8, $3 # E : + bge $2, $do_wh64 # U : + + nop + nop + nop + beq $3, $no_quad # U : Might have finished already + + nop + nop + nop + br $loop # U : Might have finished already + +$do_wh64_512: +# wh64 ($4) # L1 : memory subsystem write hint + subl $3, 24, $2 # E : For determining future wh64 addresses + + fillcs 128*1($5) + e_fillcs 128*5($5) + +# stl_nc $17, 0($5) # L : + stl $17, 0($5) # L : + nop # E : + +# addl $5, 128, $4 # E : speculative target of next wh64 +# stl_nc $17, 8($5) # L : + stl $17, 8($5) # L : +# stl_nc $17, 16($5) # L : + stl $17, 16($5) # L : + addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) + +# stl_nc $17, 24($5) # L : + stl $17, 24($5) # L : +# stl_nc $17, 32($5) # L : + stl $17, 32($5) # L : +# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle + nop + +# stl_nc $17, 40($5) # L : + stl $17, 40($5) # L : +# stl_nc $17, 48($5) # L : + stl $17, 48($5) # L : + subl $3, 16, $2 # E : Repeat the loop at least once more? + nop + +# stl_nc $17, 56($5) # L : + stl $17, 56($5) # L : + addl $5, 64, $5 # E : + subl $3, 8, $3 # E : + bge $2, $do_wh64_512 # U : + + nop + nop + nop + beq $3, $no_quad # U : Might have finished already + + .align 4 + /* + * Simple loop for trailing quadwords, or for small amounts + * of data (where we can't use an unrolled loop and wh64) + */ +$loop: + stl $17, 0($5) # L : + subl $3, 1, $3 # E : Decrement number quads left + addl $5, 8, $5 # E : Inc address + bne $3, $loop # U : more? + +$no_quad: + /* + * Write 0..7 trailing bytes. + */ + nop # E : + beq $18, $end # U : All done? + +#ifndef pixman_error +/* if the addr is unaligned in multi-thread, this will cause thread unsafty, + so use stb to store the trailing bytes. 
*/ +$trailing: + stb $17, 0($5) + subl $18, 1, $18 + beq $18, $end + addl $5, 1, $5 + br $trailing +#else + ldl $7, 0($5) # L : + mask7b $7, $6, $2 # U : Mask final quad + + ins7b $17, $6, $4 # U : New bits + or $2, $4, $1 # E : Put it all together + stl $1, 0($5) # L : And back to memory + ret $31,($26),1 # L0 : +#endif + +$within_quad: +#ifdef PIXMAN_ERROR + /* if the addr is unaligned in multi-thread, this will cause thread + unsafty,so use stb to store the trailing bytes. */ + ldl_u $1, 0($16) # L : + ins3b $17, $16, $2 # U : New bits + mask3b $1, $16, $4 # U : Clear old + or $2, $4, $2 # E : New result + + mask3b $2, $6, $4 # U : + mask7b $1, $6, $2 # U : + or $2, $4, $1 # E : + stl_u $1, 0($16) # L : +#else + stb $8, 0($16) + subl $18, 1, $18 + beq $18, $end + addl $16, 1, $16 + br $within_quad +#endif + +$end: + nop + nop + nop + ret $31,($26),1 # L0 : + + END(memset) +libc_hidden_builtin_def (memset) diff --git a/sysdeps/sw_64/sw6b/rshift.S b/sysdeps/sw_64/sw6b/rshift.S new file mode 100644 index 00000000..ec2a78b0 --- /dev/null +++ b/sysdeps/sw_64/sw6b/rshift.S @@ -0,0 +1,170 @@ + # Sw_64 __mpn_rshift -- + + # Copyright (C) 1994-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 3.25 cycles/limb on the sw_64. 
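 + # For reference, a matching C sketch of the right-shift direction
 + # (illustrative only, not part of the patch; same assumptions as the
 + # __mpn_lshift sketch: 64-bit limbs, cnt in 1..63, and the hypothetical
 + # name mpn_rshift_ref):
 + #
 + #   mp_limb_t
 + #   mpn_rshift_ref (mp_limb_t *rp, const mp_limb_t *up, size_t n, unsigned cnt)
 + #   {
 + #     mp_limb_t low = up[0];
 + #     mp_limb_t ret = low << (64 - cnt);    /* bits shifted out the bottom */
 + #     for (size_t i = 0; i < n - 1; i++)
 + #       {
 + #         mp_limb_t high = up[i + 1];
 + #         rp[i] = (low >> cnt) | (high << (64 - cnt));
 + #         low = high;
 + #       }
 + #     rp[n - 1] = low >> cnt;
 + #     return ret;                           /* function result */
 + #   }
 + #
 + # Unlike __mpn_lshift, this routine walks the limbs from the bottom up,
 + # which is why no end-of-vector pointer adjustment is needed on entry.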
+ + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldl $4,0($17) # load first limb + subl $31,$19,$20 + subl $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + sll $4,$20,$0 # compute function result + + beq $28,.L0 + subl $18,$28,$18 + + .align 3 +.Loop0: ldl $3,8($17) + addl $16,8,$16 + srl $4,$19,$5 + addl $17,8,$17 + subl $28,1,$28 + sll $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stl $8,-8($16) + bne $28,.Loop0 + +.L0: srl $4,$19,$24 + beq $18,.Lend + # warm up phase 1 + ldl $1,8($17) + subl $18,4,$18 + ldl $2,16($17) + ldl $3,24($17) + ldl $4,32($17) + beq $18,.Lend1 + # warm up phase 2 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + ldl $1,40($17) + srl $2,$19,$22 + ldl $2,48($17) + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + ldl $3,56($17) + srl $4,$19,$24 + ldl $4,64($17) + subl $18,4,$18 + beq $18,.Lend2 + .align 4 + # main loop +.Loop: stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + + sll $1,$20,$7 + subl $18,4,$18 + srl $1,$19,$21 + unop # ldl $31,-96($17) + + sll $2,$20,$8 + ldl $1,72($17) + srl $2,$19,$22 + ldl $2,80($17) + + stl $5,16($16) + or $7,$24,$7 + stl $6,24($16) + or $8,$21,$8 + + sll $3,$20,$5 + unop # ldl $31,-96($17) + srl $3,$19,$23 + addl $16,32,$16 + + sll $4,$20,$6 + ldl $3,88($17) + srl $4,$19,$24 + ldl $4,96($17) + + addl $17,32,$17 + bne $18,.Loop + # cool down phase 2/1 +.Lend2: stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + stl $5,16($16) + or $7,$24,$7 + stl $6,24($16) + or $8,$21,$8 + sll $3,$20,$5 + srl $3,$19,$23 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 2/2 + stl $7,32($16) + or $5,$22,$5 + stl $8,40($16) + or $6,$23,$6 + stl $5,48($16) + stl $6,56($16) + # cool down phase 2/3 + stl $24,64($16) + ret $31,($26),1 + + # cool down phase 1/1 +.Lend1: sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 1/2 + stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + stl $5,16($16) + stl $6,24($16) + stl $24,32($16) + ret $31,($26),1 + +.Lend: stl $24,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/sysdeps/sw_64/sw6b/stxcpy.S b/sysdeps/sw_64/sw6b/stxcpy.S new file mode 100644 index 00000000..cf07eb8e --- /dev/null +++ b/sysdeps/sw_64/sw6b/stxcpy.S @@ -0,0 +1,314 @@ +/* Copyright (C) 2000-2023 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + SW6 optimized by Rick Gorton . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +/* Copy a null-terminated string from SRC to DST. + + This is an internal routine used by strcpy, stpcpy, and strcat. 
+ As such, it uses special linkage conventions to make implementation + of these public functions more efficient. + + On input: + t9 = return address + a0 = DST + a1 = SRC + + On output: + t8 = bitmask (with one bit set) indicating the last byte written + a0 = unaligned address of the last *word* written + + Furthermore, v0, a3-a5, t11, and t12 are untouched. +*/ + + +#include + + .arch ev6 + .set noat + .set noreorder + + .text + .type __stxcpy, @function + .globl __stxcpy + .usepv __stxcpy, no + + cfi_startproc + cfi_return_column (t9) + + /* On entry to this basic block: + t0 == the first destination word for masking back in + t1 == the first source word. */ + .align 4 +stxcpy_aligned: + /* Create the 1st output word and detect 0's in the 1st input word. */ + ldi t2, -1 # E : build a mask against false zero + mask7b t2, a1, t2 # U : detection in the src word (stall) + mask7b t1, a1, t3 # U : + ornot t1, t2, t2 # E : (stall) + + mask3b t0, a1, t0 # U : assemble the first output word + cmpgeb zero, t2, t10 # E : bits set iff null found + or t0, t3, t1 # E : (stall) + bne t10, $a_eos # U : (stall) + + /* On entry to this basic block: + t0 == the first destination word for masking back in + t1 == a source word not containing a null. */ + /* Nops here to separate store quads from load quads */ + +$a_loop: + stl_u t1, 0(a0) # L : + addl a0, 8, a0 # E : + nop + nop + + ldl_u t1, 0(a1) # L : Latency=3 + addl a1, 8, a1 # E : + cmpgeb zero, t1, t10 # E : (3 cycle stall) + beq t10, $a_loop # U : (stall for t10) + + /* Take care of the final (partial) word store. + On entry to this basic block we have: + t1 == the source word containing the null + t10 == the cmpgeb mask that found it. */ +$a_eos: + negl t10, t6 # E : find low bit set + and t10, t6, t8 # E : (stall) + /* For the sake of the cache, don't read a destination word + if we're not going to need it. */ + and t8, 0x80, t6 # E : (stall) + bne t6, 1f # U : (stall) + + /* We're doing a partial word store and so need to combine + our source and original destination words. */ + ldl_u t0, 0(a0) # L : Latency=3 + subl t8, 1, t6 # E : + zapnot t1, t6, t1 # U : clear src bytes >= null (stall) + or t8, t6, t10 # E : (stall) + + zap t0, t10, t0 # E : clear dst bytes <= null + or t0, t1, t1 # E : (stall) + nop + nop + +1: stl_u t1, 0(a0) # L : + ret (t9) # L0 : Latency=3 + nop + nop + + .align 4 +__stxcpy: + /* Are source and destination co-aligned? */ + xor a0, a1, t0 # E : + unop # E : + and t0, 7, t0 # E : (stall) + bne t0, $unaligned # U : (stall) + + /* We are co-aligned; take care of a partial first word. */ + ldl_u t1, 0(a1) # L : load first src word + and a0, 7, t0 # E : take care not to load a word ... + addl a1, 8, a1 # E : + beq t0, stxcpy_aligned # U : ... if we wont need it (stall) + + ldl_u t0, 0(a0) # L : + br stxcpy_aligned # L0 : Latency=3 + nop + nop + + +/* The source and destination are not co-aligned. Align the destination + and cope. We have to be very careful about not reading too much and + causing a SEGV. */ + + .align 4 +$u_head: + /* We know just enough now to be able to assemble the first + full source word. We can still find a zero at the end of it + that prevents us from outputting the whole thing. 
+ + On entry to this basic block: + t0 == the first dest word, for masking back in, if needed else 0 + t1 == the low bits of the first source word + t6 == bytemask that is -1 in dest word bytes */ + + ldl_u t2, 8(a1) # L : + addl a1, 8, a1 # E : + ext3b t1, a1, t1 # U : (stall on a1) + ext7b t2, a1, t4 # U : (stall on a1) + + mask3b t0, a0, t0 # U : + or t1, t4, t1 # E : + mask7b t1, a0, t1 # U : (stall on t1) + or t0, t1, t1 # E : (stall on t1) + + or t1, t6, t6 # E : + cmpgeb zero, t6, t10 # E : (stall) + ldi t6, -1 # E : for masking just below + bne t10, $u_final # U : (stall) + + mask3b t6, a1, t6 # U : mask out the bits we have + or t6, t2, t2 # E : already extracted before (stall) + cmpgeb zero, t2, t10 # E : testing eos (stall) + bne t10, $u_late_head_exit # U : (stall) + + /* Finally, we've got all the stupid leading edge cases taken care + of and we can set up to enter the main loop. */ + + stl_u t1, 0(a0) # L : store first output word + addl a0, 8, a0 # E : + ext3b t2, a1, t0 # U : position ho-bits of lo word + ldl_u t2, 8(a1) # U : read next high-order source word + + addl a1, 8, a1 # E : + cmpgeb zero, t2, t10 # E : (stall for t2) + nop # E : + bne t10, $u_eos # U : (stall) + + /* Unaligned copy main loop. In order to avoid reading too much, + the loop is structured to detect zeros in aligned source words. + This has, unfortunately, effectively pulled half of a loop + iteration out into the head and half into the tail, but it does + prevent nastiness from accumulating in the very thing we want + to run as fast as possible. + + On entry to this basic block: + t0 == the shifted high-order bits from the previous source word + t2 == the unshifted current source word + + We further know that t2 does not contain a null terminator. */ + + .align 3 +$u_loop: + ext7b t2, a1, t1 # U : extract high bits for current word + addl a1, 8, a1 # E : (stall) + ext3b t2, a1, t3 # U : extract low bits for next time (stall) + addl a0, 8, a0 # E : + + or t0, t1, t1 # E : current dst word now complete + ldl_u t2, 0(a1) # L : Latency=3 load high word for next time + stl_u t1, -8(a0) # L : save the current word (stall) + mov t3, t0 # E : + + cmpgeb zero, t2, t10 # E : test new word for eos + beq t10, $u_loop # U : (stall) + nop + nop + + /* We've found a zero somewhere in the source word we just read. + If it resides in the lower half, we have one (probably partial) + word to write out, and if it resides in the upper half, we + have one full and one partial word left to write out. + + On entry to this basic block: + t0 == the shifted high-order bits from the previous source word + t2 == the unshifted current source word. */ +$u_eos: + ext7b t2, a1, t1 # U : + or t0, t1, t1 # E : first (partial) source word complete (stall) + cmpgeb zero, t1, t10 # E : is the null in this first bit? (stall) + bne t10, $u_final # U : (stall) + +$u_late_head_exit: + stl_u t1, 0(a0) # L : the null was in the high-order bits + addl a0, 8, a0 # E : + ext3b t2, a1, t1 # U : + cmpgeb zero, t1, t10 # E : (stall) + + /* Take care of a final (probably partial) result word. + On entry to this basic block: + t1 == assembled source word + t10 == cmpgeb mask that found the null. 
*/ +$u_final: + negl t10, t6 # E : isolate low bit set + and t6, t10, t8 # E : (stall) + and t8, 0x80, t6 # E : avoid dest word load if we can (stall) + bne t6, 1f # U : (stall) + + ldl_u t0, 0(a0) # E : + subl t8, 1, t6 # E : + or t6, t8, t10 # E : (stall) + zapnot t1, t6, t1 # U : kill source bytes >= null (stall) + + zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall) + or t0, t1, t1 # E : (stall) + nop + nop + +1: stl_u t1, 0(a0) # L : + ret (t9) # L0 : Latency=3 + nop + nop + + /* Unaligned copy entry point. */ + .align 4 +$unaligned: + + ldl_u t1, 0(a1) # L : load first source word + and a0, 7, t4 # E : find dest misalignment + and a1, 7, t5 # E : find src misalignment + /* Conditionally load the first destination word and a bytemask + with 0xff indicating that the destination byte is sacrosanct. */ + mov zero, t0 # E : + + mov zero, t6 # E : + beq t4, 1f # U : + ldl_u t0, 0(a0) # L : + ldi t6, -1 # E : + + mask3b t6, a0, t6 # U : + nop + nop + nop +1: + subl a1, t4, a1 # E : sub dest misalignment from src addr + /* If source misalignment is larger than dest misalignment, we need + extra startup checks to avoid SEGV. */ + cmplt t4, t5, t8 # E : + beq t8, $u_head # U : + ldi t2, -1 # E : mask out leading garbage in source + + mask7b t2, t5, t2 # U : + ornot t1, t2, t3 # E : (stall) + cmpgeb zero, t3, t10 # E : is there a zero? (stall) + beq t10, $u_head # U : (stall) + + /* At this point we've found a zero in the first partial word of + the source. We need to isolate the valid source data and mask + it into the original destination data. (Incidentally, we know + that we'll need at least one byte of that original dest word.) */ + + ldl_u t0, 0(a0) # L : + negl t10, t6 # E : build bitmask of bytes <= zero + and t6, t10, t8 # E : (stall) + and a1, 7, t5 # E : + + subl t8, 1, t6 # E : + or t6, t8, t10 # E : (stall) + srl t8, t5, t8 # U : adjust final null return value + zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall) + + and t1, t2, t1 # E : to source validity mask + ext3b t2, a1, t2 # U : + ext3b t1, a1, t1 # U : (stall) + andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall) + + or t0, t1, t1 # e1 : and put it there + stl_u t1, 0(a0) # .. e0 : (stall) + ret (t9) # e1 : + + cfi_endproc diff --git a/sysdeps/sw_64/sw6b/stxncpy.S b/sysdeps/sw_64/sw6b/stxncpy.S new file mode 100644 index 00000000..c47029ea --- /dev/null +++ b/sysdeps/sw_64/sw6b/stxncpy.S @@ -0,0 +1,392 @@ +/* Copyright (C) 2000-2023 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + SW6 optimized by Rick Gorton . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +/* Copy no more than COUNT bytes of the null-terminated string from + SRC to DST. + + This is an internal routine used by strncpy, stpncpy, and strncat. 
+ As such, it uses special linkage conventions to make implementation + of these public functions more efficient. + + On input: + t9 = return address + a0 = DST + a1 = SRC + a2 = COUNT + + Furthermore, COUNT may not be zero. + + On output: + t0 = last word written + t8 = bitmask (with one bit set) indicating the last byte written + t10 = bitmask (with one bit set) indicating the byte position of + the end of the range specified by COUNT + a0 = unaligned address of the last *word* written + a2 = the number of full words left in COUNT + + Furthermore, v0, a3-a5, t11, and t12 are untouched. +*/ + +#include + + .arch ev6 + .set noat + .set noreorder + + .text + .type __stxncpy, @function + .globl __stxncpy + .usepv __stxncpy, no + + cfi_startproc + cfi_return_column (t9) + + /* On entry to this basic block: + t0 == the first destination word for masking back in + t1 == the first source word. */ + .align 4 +stxncpy_aligned: + /* Create the 1st output word and detect 0's in the 1st input word. */ + ldi t2, -1 # E : build a mask against false zero + mask7b t2, a1, t2 # U : detection in the src word (stall) + mask7b t1, a1, t3 # U : + ornot t1, t2, t2 # E : (stall) + + mask3b t0, a1, t0 # U : assemble the first output word + cmpgeb zero, t2, t7 # E : bits set iff null found + or t0, t3, t0 # E : (stall) + beq a2, $a_eoc # U : + + bne t7, $a_eos # U : + nop + nop + nop + + /* On entry to this basic block: + t0 == a source word not containing a null. */ + + /* + * nops here to: + * separate store quads from load quads + * limit of 1 bcond/quad to permit training + */ +$a_loop: + stl_u t0, 0(a0) # L : + addl a0, 8, a0 # E : + subl a2, 1, a2 # E : + nop + + ldl_u t0, 0(a1) # L : + addl a1, 8, a1 # E : + cmpgeb zero, t0, t7 # E : + beq a2, $a_eoc # U : + + beq t7, $a_loop # U : + nop + nop + nop + + /* Take care of the final (partial) word store. At this point + the end-of-count bit is set in t7 iff it applies. + + On entry to this basic block we have: + t0 == the source word containing the null + t7 == the cmpgeb mask that found it. */ +$a_eos: + negl t7, t8 # E : find low bit set + and t7, t8, t8 # E : (stall) + /* For the sake of the cache, don't read a destination word + if we're not going to need it. */ + and t8, 0x80, t6 # E : (stall) + bne t6, 1f # U : (stall) + + /* We're doing a partial word store and so need to combine + our source and original destination words. */ + ldl_u t1, 0(a0) # L : + subl t8, 1, t6 # E : + or t8, t6, t7 # E : (stall) + zapnot t0, t7, t0 # U : clear src bytes > null (stall) + + zap t1, t7, t1 # .. e1 : clear dst bytes <= null + or t0, t1, t0 # e1 : (stall) + nop + nop + +1: stl_u t0, 0(a0) # L : + ret (t9) # L0 : Latency=3 + nop + nop + + /* Add the end-of-count bit to the eos detection bitmask. */ +$a_eoc: + or t10, t7, t7 # E : + br $a_eos # L0 : Latency=3 + nop + nop + + .align 4 +__stxncpy: + /* Are source and destination co-aligned? */ + ldi t2, -1 # E : + xor a0, a1, t1 # E : + and a0, 7, t0 # E : find dest misalignment + nop # E : + + srl t2, 1, t2 # U : + and t1, 7, t1 # E : + sellt a2, t2, a2, a2 # E : bound count to LONG_MAX (stall) + nop # E : + + addl a2, t0, a2 # E : bias count by dest misalignment + subl a2, 1, a2 # E : (stall) + and a2, 7, t2 # E : (stall) + ldi t10, 1 # E : + + srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 + sll t10, t2, t10 # U : t10 = bitmask of last count byte + nop # E : + bne t1, $unaligned # U : (stall) + + /* We are co-aligned; take care of a partial first word. 
*/ + ldl_u t1, 0(a1) # L : load first src word + addl a1, 8, a1 # E : + beq t0, stxncpy_aligned # U : avoid loading dest word if not needed + ldl_u t0, 0(a0) # L : + + br stxncpy_aligned # U : + nop + nop + nop + + + +/* The source and destination are not co-aligned. Align the destination + and cope. We have to be very careful about not reading too much and + causing a SEGV. */ + + .align 4 +$u_head: + /* We know just enough now to be able to assemble the first + full source word. We can still find a zero at the end of it + that prevents us from outputting the whole thing. + + On entry to this basic block: + t0 == the first dest word, unmasked + t1 == the shifted low bits of the first source word + t6 == bytemask that is -1 in dest word bytes */ + + ldl_u t2, 8(a1) # L : Latency=3 load second src word + addl a1, 8, a1 # E : + mask3b t0, a0, t0 # U : mask trailing garbage in dst + ext7b t2, a1, t4 # U : (3 cycle stall on t2) + + or t1, t4, t1 # E : first aligned src word complete (stall) + mask7b t1, a0, t1 # U : mask leading garbage in src (stall) + or t0, t1, t0 # E : first output word complete (stall) + or t0, t6, t6 # E : mask original data for zero test (stall) + + cmpgeb zero, t6, t7 # E : + beq a2, $u_eocfin # U : + ldi t6, -1 # E : + nop + + bne t7, $u_final # U : + mask3b t6, a1, t6 # U : mask out bits already seen + stl_u t0, 0(a0) # L : store first output word + or t6, t2, t2 # E : + + cmpgeb zero, t2, t7 # E : find nulls in second partial + addl a0, 8, a0 # E : + subl a2, 1, a2 # E : + bne t7, $u_late_head_exit # U : + + /* Finally, we've got all the stupid leading edge cases taken care + of and we can set up to enter the main loop. */ + ext3b t2, a1, t1 # U : position hi-bits of lo word + beq a2, $u_eoc # U : + ldl_u t2, 8(a1) # L : read next high-order source word + addl a1, 8, a1 # E : + + ext7b t2, a1, t0 # U : position lo-bits of hi word (stall) + cmpgeb zero, t2, t7 # E : + nop + bne t7, $u_eos # U : + + /* Unaligned copy main loop. In order to avoid reading too much, + the loop is structured to detect zeros in aligned source words. + This has, unfortunately, effectively pulled half of a loop + iteration out into the head and half into the tail, but it does + prevent nastiness from accumulating in the very thing we want + to run as fast as possible. + + On entry to this basic block: + t0 == the shifted low-order bits from the current source word + t1 == the shifted high-order bits from the previous source word + t2 == the unshifted current source word + + We further know that t2 does not contain a null terminator. */ + + .align 4 +$u_loop: + or t0, t1, t0 # E : current dst word now complete + subl a2, 1, a2 # E : decrement word count + ext3b t2, a1, t1 # U : extract high bits for next time + addl a0, 8, a0 # E : + + stl_u t0, -8(a0) # L : save the current word + beq a2, $u_eoc # U : + ldl_u t2, 8(a1) # L : Latency=3 load high word for next time + addl a1, 8, a1 # E : + + ext7b t2, a1, t0 # U : extract low bits (2 cycle stall) + cmpgeb zero, t2, t7 # E : test new word for eos + nop + beq t7, $u_loop # U : + + /* We've found a zero somewhere in the source word we just read. + If it resides in the lower half, we have one (probably partial) + word to write out, and if it resides in the upper half, we + have one full and one partial word left to write out. + + On entry to this basic block: + t0 == the shifted low-order bits from the current source word + t1 == the shifted high-order bits from the previous source word + t2 == the unshifted current source word. 
*/ +$u_eos: + or t0, t1, t0 # E : first (partial) source word complete + nop + cmpgeb zero, t0, t7 # E : is the null in this first bit? (stall) + bne t7, $u_final # U : (stall) + + stl_u t0, 0(a0) # L : the null was in the high-order bits + addl a0, 8, a0 # E : + subl a2, 1, a2 # E : + nop + +$u_late_head_exit: + ext3b t2, a1, t0 # U : + cmpgeb zero, t0, t7 # E : + or t7, t10, t6 # E : (stall) + seleq a2, t6, t7, t7 # E : Latency=2, extra map slot (stall) + + /* Take care of a final (probably partial) result word. + On entry to this basic block: + t0 == assembled source word + t7 == cmpgeb mask that found the null. */ +$u_final: + negl t7, t6 # E : isolate low bit set + and t6, t7, t8 # E : (stall) + and t8, 0x80, t6 # E : avoid dest word load if we can (stall) + bne t6, 1f # U : (stall) + + ldl_u t1, 0(a0) # L : + subl t8, 1, t6 # E : + or t6, t8, t7 # E : (stall) + zapnot t0, t7, t0 # U : kill source bytes > null + + zap t1, t7, t1 # U : kill dest bytes <= null + or t0, t1, t0 # E : (stall) + nop + nop + +1: stl_u t0, 0(a0) # L : + ret (t9) # L0 : Latency=3 + + /* Got to end-of-count before end of string. + On entry to this basic block: + t1 == the shifted high-order bits from the previous source word */ +$u_eoc: + and a1, 7, t6 # E : + sll t10, t6, t6 # U : (stall) + and t6, 0xff, t6 # E : (stall) + bne t6, 1f # U : (stall) + + ldl_u t2, 8(a1) # L : load final src word + nop + ext7b t2, a1, t0 # U : extract low bits for last word (stall) + or t1, t0, t1 # E : (stall) + +1: cmpgeb zero, t1, t7 # E : + mov t1, t0 + +$u_eocfin: # end-of-count, final word + or t10, t7, t7 # E : + br $u_final # L0 : Latency=3 + + /* Unaligned copy entry point. */ + .align 4 +$unaligned: + + ldl_u t1, 0(a1) # L : load first source word + and a0, 7, t4 # E : find dest misalignment + and a1, 7, t5 # E : find src misalignment + /* Conditionally load the first destination word and a bytemask + with 0xff indicating that the destination byte is sacrosanct. */ + mov zero, t0 # E : + + mov zero, t6 # E : + beq t4, 1f # U : + ldl_u t0, 0(a0) # L : + ldi t6, -1 # E : + + mask3b t6, a0, t6 # U : + nop + nop +1: subl a1, t4, a1 # E : sub dest misalignment from src addr + + /* If source misalignment is larger than dest misalignment, we need + extra startup checks to avoid SEGV. */ + + cmplt t4, t5, t8 # E : + ext3b t1, a1, t1 # U : shift src into place + ldi t2, -1 # E : for creating masks later + beq t8, $u_head # U : (stall) + + mask7b t2, t5, t2 # U : begin src byte validity mask + cmpgeb zero, t1, t7 # E : is there a zero? + ext3b t2, a1, t2 # U : + or t7, t10, t5 # E : test for end-of-count too + + cmpgeb zero, t2, t3 # E : + seleq a2, t5, t7, t7 # E : Latency=2, extra map slot + nop # E : keep with seleq + andnot t7, t3, t7 # E : (stall) + + beq t7, $u_head # U : + /* At this point we've found a zero in the first partial word of + the source. We need to isolate the valid source data and mask + it into the original destination data. (Incidentally, we know + that we'll need at least one byte of that original dest word.) 
*/ + ldl_u t0, 0(a0) # L : + negl t7, t6 # E : build bitmask of bytes <= zero + mask7b t1, t4, t1 # U : + + and t6, t7, t8 # E : + subl t8, 1, t6 # E : (stall) + or t6, t8, t7 # E : (stall) + zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall) + + zapnot t1, t7, t1 # U : to source validity mask + andnot t0, t2, t0 # E : zero place for source to reside + or t0, t1, t0 # E : and put it there (stall both t0, t1) + stl_u t0, 0(a0) # L : (stall) + + ret (t9) # L0 : Latency=3 + + cfi_endproc diff --git a/sysdeps/sw_64/sw6b/sub_n.S b/sysdeps/sw_64/sw6b/sub_n.S new file mode 100644 index 00000000..95c257f7 --- /dev/null +++ b/sysdeps/sw_64/sw6b/sub_n.S @@ -0,0 +1,147 @@ + # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subl $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldl $0,0($18) + ldl $1,8($18) + ldl $4,0($17) + ldl $5,8($17) + addl $17,32,$17 # update s1_ptr + ldl $2,16($18) + subl $4,$0,$20 # 1st main sub + ldl $3,24($18) + subl $19,4,$19 # decr loop cnt + ldl $6,-16($17) + cmpult $4,$20,$25 # compute cy from last sub + ldl $7,-8($17) + addl $1,$25,$28 # cy add + addl $18,32,$18 # update s2_ptr + subl $5,$28,$21 # 2nd main sub + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $5,$21,$25 # compute cy from last add + ldl $0,0($18) + or $8,$25,$25 # combine cy from the two fadds + ldl $1,8($18) + addl $2,$25,$28 # cy add + ldl $4,0($17) + subl $6,$28,$22 # 3rd main sub + ldl $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + subl $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + addl $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + addl $0,$25,$28 # cy add + ldl $2,16($18) + subl $4,$28,$20 # 1st main sub + ldl $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldl $6,-16($17) + cmpult $4,$20,$25 # compute cy from last add + ldl $7,-8($17) + or $8,$25,$25 # combine cy from the two fadds + subl $19,4,$19 # decr loop cnt + stl $22,-16($16) + addl $1,$25,$28 # cy add + stl $23,-8($16) + subl $5,$28,$21 
# 2nd main sub + addl $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $5,$21,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $2,$25,$28 # cy add + subl $6,$28,$22 # 3rd main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + subl $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + stl $22,-16($16) + stl $23,-8($16) +.Lend2: addl $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldl $0,0($18) + ldl $4,0($17) + subl $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addl $0,$25,$28 # cy add + ldl $0,8($18) + subl $4,$28,$20 # main sub + ldl $1,8($17) + addl $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addl $17,8,$17 + stl $20,0($16) + cmpult $4,$20,$25 # compute cy from last add + subl $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two fadds + addl $16,8,$16 + or $1,$31,$4 + bne $19,.Loop0 +.Lend0: addl $0,$25,$28 # cy add + subl $4,$28,$20 # main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $4,$20,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + +.Lret: or $25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_sub_n diff --git a/sysdeps/sw_64/sw8a/add_n.S b/sysdeps/sw_64/sw8a/add_n.S new file mode 100644 index 00000000..86e9f9ae --- /dev/null +++ b/sysdeps/sw_64/sw8a/add_n.S @@ -0,0 +1,146 @@ + # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . 
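+
+ # As a reading aid for the carry handling below: in portable C terms the
+ # routine behaves roughly like the sketch that follows.  The name
+ # mpn_add_n_ref and the use of 64-bit unsigned long as the limb type are
+ # illustrative assumptions, not part of this patch.
+ #
+ #   unsigned long
+ #   mpn_add_n_ref (unsigned long *rp, const unsigned long *s1,
+ #                  const unsigned long *s2, long n)
+ #   {
+ #     unsigned long cy = 0;
+ #     for (long i = 0; i < n; i++)
+ #       {
+ #         unsigned long t = s2[i] + cy;   /* cy add */
+ #         unsigned long c1 = t < cy;      /* carry out of the cy add */
+ #         unsigned long r = s1[i] + t;    /* main add */
+ #         unsigned long c2 = r < t;       /* carry out of the main add */
+ #         rp[i] = r;
+ #         cy = c1 | c2;                   /* the two carries never both fire */
+ #       }
+ #     return cy;
+ #   }
+ #
+ # The assembly computes the same two partial carries with cmpult and
+ # merges them with or, software-pipelined four limbs at a time.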
+ + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subl $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldl $0,0($18) + ldl $1,8($18) + ldl $4,0($17) + ldl $5,8($17) + addl $17,32,$17 # update s1_ptr + ldl $2,16($18) + addl $0,$4,$20 # 1st main add + ldl $3,24($18) + subl $19,4,$19 # decr loop cnt + ldl $6,-16($17) + cmpult $20,$0,$25 # compute cy from last add + ldl $7,-8($17) + addl $1,$25,$28 # cy add + addl $18,32,$18 # update s2_ptr + addl $5,$28,$21 # 2nd main add + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $21,$28,$25 # compute cy from last add + ldl $0,0($18) + or $8,$25,$25 # combine cy from the two fadds + ldl $1,8($18) + addl $2,$25,$28 # cy add + ldl $4,0($17) + addl $28,$6,$22 # 3rd main add + ldl $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + addl $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + addl $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + addl $0,$25,$28 # cy add + ldl $2,16($18) + addl $4,$28,$20 # 1st main add + ldl $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldl $6,-16($17) + cmpult $20,$28,$25 # compute cy from last add + ldl $7,-8($17) + or $8,$25,$25 # combine cy from the two fadds + subl $19,4,$19 # decr loop cnt + stl $22,-16($16) + addl $1,$25,$28 # cy add + stl $23,-8($16) + addl $5,$28,$21 # 2nd main add + addl $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $21,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $2,$25,$28 # cy add + addl $28,$6,$22 # 3rd main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $22,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + addl $28,$7,$23 # 4th main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $23,$28,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + stl $22,-16($16) + stl $23,-8($16) +.Lend2: addl $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldl $0,0($18) + ldl $4,0($17) + subl $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addl $0,$25,$28 # cy add + ldl $0,8($18) + addl $4,$28,$20 # main add + ldl $4,8($17) + addl $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addl $17,8,$17 + stl $20,0($16) + cmpult $20,$28,$25 # compute cy from last add + subl $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two fadds + addl $16,8,$16 + bne $19,.Loop0 +.Lend0: addl $0,$25,$28 # cy add + addl $4,$28,$20 # main add + cmpult $28,$25,$8 # compute cy from last add + cmpult $20,$28,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + +.Lret: or 
$25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_add_n diff --git a/sysdeps/sw_64/sw8a/addmul_1.S b/sysdeps/sw_64/sw8a/addmul_1.S new file mode 100644 index 00000000..95487c26 --- /dev/null +++ b/sysdeps/sw_64/sw8a/addmul_1.S @@ -0,0 +1,475 @@ + # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add + # the result to a second limb vector. + # + # Copyright (C) 2000-2023 Free Software Foundation, Inc. + # + # This file is part of the GNU MP Library. + # + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published + # by the Free Software Foundation; either version 2.1 of the License, or (at + # your option) any later version. + # + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + # + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # size $18 + # s2_limb $19 + # + # + # This code was written in close cooperation with pipeline expert + # . Any errors are tege's fault, though. + # + # Register usages for unrolled loop: + # 0-3 mul's + # 4-7 acc's + # 8-15 mul results + # 20,21 carry's + # 22,23 save for stores + # + # Sustains 8 mul-fadds in 29 cycles in the unrolled inner loop. + # + # The stores can issue a cycle late so we have paired no-op's to 'catch' + # them, so that further disturbance to the schedule is damped. + # + # We couldn't pair the loads, because the entangled schedule of the + # carry's has to happen on one side {0} of the machine. Note, the total + # use of U0, and the total use of L0 (after attending to the stores). + # which is part of the reason why.... + # + # This is a great schedule for the d_cache, a poor schedule for the + # b_cache. The lockup on U0 means that any stall can't be recovered + # from. Consider a ldl in L1. say that load gets stalled because it + # collides with a fill from the b_Cache. On the next cycle, this load + # gets priority. If first looks at L0, and goes there. The instruction + # we intended for L0 gets to look at L1, which is NOT where we want + # it. It either stalls 1, because it can't go in L0, or goes there, and + # causes a further instruction to stall. + # + # So for b_cache, we're likely going to want to put one or more cycles + # back into the code! And, of course, put in prefetches. For the + # accumulator, flds, intent to modify. For the fmuldiplier, you might + # want ldl, evict next, if you're not wanting to use it again soon. Use + # 256 ahead of present pointer value. At a place where we have an mt + # followed by a bookkeeping, put the bookkeeping in upper, and the + # prefetch into lower. + # + # Note, the usage of physical registers per cycle is smoothed off, as + # much as possible. + # + # Note, the ldl's and stl's are at the end of the quadpacks. note, we'd + # like not to have a ldl or stl to preceded a conditional branch in a + # quadpack. The conditional branch moves the retire pointer one cycle + # later. + # + # Optimization notes: + # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27? 
+ # Reserved regs: $29 $30 $31 + # Free caller-saves regs in unrolled code: $24 $25 $28 + # We should swap some of the callee-saves regs for some of the free + # caller-saves regs, saving some overhead cycles. + # Most importantly, we should write fast code for the 0-7 case. + # The code we use there are for the 21164, and runs at 7 cycles/limb + # on the 21264. Should not be hard, if we write specialized code for + # 1-7 limbs (the one for 0 limbs should be straightforward). We then just + # need a jump table indexed by the low 3 bits of the count argument. + + .set noreorder + .set noat + .text + + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 +__mpn_addmul_1: + .frame $30,0,$26,0 + .prologue 0 + + cmpult $18, 8, $1 + beq $1, $Large + + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $18, 1, $18 # size-- + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + umulh $2, $19, $0 # $0 = prod_high + beq $18, $Lend0b # jump if size was == 1 + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $18, 1, $18 # size-- + addl $5, $3, $3 + cmpult $3, $5, $4 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + beq $18, $Lend0a # jump if size was == 2 + + .align 3 +$Loop0: mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + subl $18, 1, $18 # size-- + umulh $2, $19, $4 # $4 = cy_limb + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + bne $18, $Loop0 +$Lend0a: + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + umulh $2, $19, $4 # $4 = cy_limb + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $5, $0, $0 # combine carries + addl $4, $0, $0 # cy_limb = prod_high + cy + ret $31, ($26), 1 +$Lend0b: + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $0, $5, $0 + ret $31, ($26), 1 + +$Large: + ldi $30, -240($30) + stl $9, 8($30) + stl $10, 16($30) + stl $11, 24($30) + stl $12, 32($30) + stl $13, 40($30) + stl $14, 48($30) + stl $15, 56($30) + + and $18, 7, $20 # count for the first loop, 0-7 + srl $18, 3, $18 # count for unrolled loop + bis $31, $31, $0 + beq $20, $Lunroll + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $20, 1, $20 # size-- + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + umulh $2, $19, $0 # $0 = prod_high + beq $20, $Lend1b # jump if size was == 1 + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + subl $20, 1, $20 # size-- + addl $5, $3, $3 + cmpult $3, $5, $4 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + beq $20, $Lend1a # jump if size was == 2 + + .align 3 +$Loop1: mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = *res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + subl $20, 1, $20 # size-- + umulh $2, $19, $4 # $4 = cy_limb + ldl $2, 0($17) # $2 = s1_limb + addl $17, 8, $17 # s1_ptr++ + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + bne $20, $Loop1 + +$Lend1a: + mull $2, $19, $3 # $3 = prod_low + ldl $5, 0($16) # $5 = 
*res_ptr + addl $4, $0, $0 # cy_limb = cy_limb + 'cy' + umulh $2, $19, $4 # $4 = cy_limb + addl $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $5, $0, $0 # combine carries + addl $4, $0, $0 # cy_limb = prod_high + cy + br $31, $Lunroll +$Lend1b: + addl $5, $3, $3 + cmpult $3, $5, $5 + stl $3, 0($16) + addl $16, 8, $16 # res_ptr++ + addl $0, $5, $0 + +$Lunroll: + ldi $17, -16($17) # L1 bookkeeping + ldi $16, -16($16) # L1 bookkeeping + bis $0, $31, $12 + + # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ + + ldl $2, 16($17) # L1 + ldl $3, 24($17) # L1 + ldi $18, -1($18) # L1 bookkeeping + ldl $6, 16($16) # L1 + ldl $7, 24($16) # L1 + ldl $0, 32($17) # L1 + mull $19, $2, $13 # U1 + ldl $1, 40($17) # L1 + umulh $19, $2, $14 # U1 + mull $19, $3, $15 # U1 + ldi $17, 64($17) # L1 bookkeeping + ldl $4, 32($16) # L1 + ldl $5, 40($16) # L1 + umulh $19, $3, $8 # U1 + ldl $2, -16($17) # L1 + mull $19, $0, $9 # U1 + ldl $3, -8($17) # L1 + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + mull $19, $1, $11 # U1 + cmpult $6, $13, $20 # L0 lo add => carry + ldi $16, 64($16) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, -16($16) # L1 + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, -8($16) # L1 + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 0($17) # L1 + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl $1, 8($17) # L1 + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # L0 lo + acc + stl $22, -48($16) # L0 + stl $23, -40($16) # L1 + mull $19, $3, $15 # U1 + addl $8, $21, $8 # U0 hi mul + carry + cmpult $4, $9, $20 # L0 lo add => carry + addl $4, $8, $22 # U0 hi add => answer + ble $18, $Lend # U1 bookkeeping + + # ____ MAIN UNROLLED LOOP ____ + .align 4 +$Loop: + bis $31, $31, $31 # U1 mt + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 0($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 8($16) # L1 + + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + ldl $2, 16($17) # L1 + + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + ldl $3, 24($17) # L1 + + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, -32($16) # L0 + stl $23, -24($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $1, $11 # U1 + bis $31, $31, $31 # L1 st slosh + addl $12, $21, $12 # U0 hi mul + carry + + cmpult $6, $13, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $18, -1($18) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, 16($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, 24($16) # L1 + + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 32($17) # L1 + + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl 
$1, 40($17) # L1 + + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # U0 lo + acc + stl $22, -16($16) # L0 + stl $23, -8($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $3, $15 # U1 + bis $31, $31, $31 # L1 st slosh + addl $8, $21, $8 # L0 hi mul + carry + + cmpult $4, $9, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $17, 64($17) # L1 bookkeeping + addl $4, $8, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 32($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 40($16) # L1 + + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + ldl $2, -16($17) # L1 + + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + ldl $3, -8($17) # L1 + + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, 0($16) # L0 + stl $23, 8($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $1, $11 # U1 + bis $31, $31, $31 # L1 st slosh + addl $12, $21, $12 # U0 hi mul + carry + + cmpult $6, $13, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + ldi $16, 64($16) # L1 bookkeeping + addl $6, $12, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + ldl $6, -16($16) # L1 + + bis $31, $31, $31 # U1 mt + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + ldl $7, -8($16) # L1 + + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + ldl $0, 0($17) # L1 + + mull $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + ldl $1, 8($17) # L1 + + umulh $19, $2, $14 # U1 + addl $4, $9, $4 # L0 lo + acc + stl $22, -48($16) # L0 + stl $23, -40($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mull $19, $3, $15 # U1 + bis $31, $31, $31 # L1 st slosh + addl $8, $21, $8 # U0 hi mul + carry + + cmpult $4, $9, $20 # L0 lo add => carry + addl $4, $8, $22 # U0 hi add => answer + bis $31, $31, $31 # L1 mt + bgt $18, $Loop # U1 bookkeeping + +# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ +$Lend: + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + ldl $4, 0($16) # L1 + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + ldl $5, 8($16) # L1 + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + mull $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + umulh $19, $0, $10 # U1 + addl $6, $13, $6 # L0 lo + acc + stl $22, -32($16) # L0 + stl $23, -24($16) # L1 + mull $19, $1, $11 # U1 + addl $12, $21, $12 # U0 hi mul + carry + cmpult $6, $13, $20 # L0 lo add => carry + addl $6, $12, $22 # U0 hi add => answer + cmpult $22, $12, $21 # L0 hi add => carry + addl $14, $20, $14 # U0 hi mul + carry + addl $7, $15, $23 # L0 lo + acc + addl $14, $21, $14 # U0 hi mul + carry + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addl $23, $14, $23 # U0 hi add => answer + cmpult $23, $14, $21 # L0 hi add => carry + addl $8, $20, $8 # U0 hi mul + carry + addl $4, $9, $4 # U0 lo + acc + stl $22, -16($16) # L0 + stl $23, -8($16) # L1 + bis $31, $31, $31 # L0 st slosh + addl $8, $21, $8 # L0 hi mul + carry + cmpult $4, $9, $20 # L0 lo 
add => carry + addl $4, $8, $22 # U0 hi add => answer + cmpult $22, $8, $21 # L0 hi add => carry + addl $10, $20, $10 # U0 hi mul + carry + addl $5, $11, $23 # L0 lo + acc + addl $10, $21, $10 # L0 hi mul + carry + cmpult $23, $11, $20 # L0 lo add => carry + addl $23, $10, $23 # U0 hi add => answer + cmpult $23, $10, $21 # L0 hi add => carry + addl $12, $20, $12 # U0 hi mul + carry + stl $22, 0($16) # L0 + stl $23, 8($16) # L1 + addl $12, $21, $0 # U0 hi mul + carry + + ldl $9, 8($30) + ldl $10, 16($30) + ldl $11, 24($30) + ldl $12, 32($30) + ldl $13, 40($30) + ldl $14, 48($30) + ldl $15, 56($30) + ldi $30, 240($30) + ret $31, ($26), 1 + + .end __mpn_addmul_1 diff --git a/sysdeps/sw_64/sw8a/lshift.S b/sysdeps/sw_64/sw8a/lshift.S new file mode 100644 index 00000000..76f1fb0e --- /dev/null +++ b/sysdeps/sw_64/sw8a/lshift.S @@ -0,0 +1,172 @@ + # Sw_64 __mpn_lshift -- + + # Copyright (C) 1994-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 3.25 cycles/limb on the sw_64. 
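+ #
+ # As a reading aid, the operation is roughly the following C (a sketch
+ # only; the name mpn_lshift_ref and the 64-bit unsigned long limb type
+ # are illustrative assumptions, and 0 < cnt < 64, size >= 1 are assumed):
+ #
+ #   unsigned long
+ #   mpn_lshift_ref (unsigned long *rp, const unsigned long *sp,
+ #                   long n, unsigned cnt)
+ #   {
+ #     unsigned long ret = sp[n - 1] >> (64 - cnt);   /* bits shifted out */
+ #     for (long i = n - 1; i > 0; i--)
+ #       rp[i] = (sp[i] << cnt) | (sp[i - 1] >> (64 - cnt));
+ #     rp[0] = sp[0] << cnt;
+ #     return ret;
+ #   }
+ #
+ # Both the sketch and the unrolled assembly walk from the most
+ # significant limb downwards, so the result can be written back over
+ # the source region in place.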
+ + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addl $18,$17,$17 # make r17 point at end of s1 + ldl $4,-8($17) # load first limb + subl $31,$19,$20 + s8addl $18,$16,$16 # make r16 point at end of RES + subl $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + srl $4,$20,$0 # compute function result + + beq $28,.L0 + subl $18,$28,$18 + + .align 3 +.Loop0: ldl $3,-16($17) + subl $16,8,$16 + sll $4,$19,$5 + subl $17,8,$17 + subl $28,1,$28 + srl $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stl $8,0($16) + bne $28,.Loop0 + +.L0: sll $4,$19,$24 + beq $18,.Lend + # warm up phase 1 + ldl $1,-16($17) + subl $18,4,$18 + ldl $2,-24($17) + ldl $3,-32($17) + ldl $4,-40($17) + beq $18,.Lend1 + # warm up phase 2 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + ldl $1,-48($17) + sll $2,$19,$22 + ldl $2,-56($17) + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + ldl $3,-64($17) + sll $4,$19,$24 + ldl $4,-72($17) + subl $18,4,$18 + beq $18,.Lend2 + .align 4 + # main loop +.Loop: stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + + srl $1,$20,$7 + subl $18,4,$18 + sll $1,$19,$21 + unop # ldl $31,-96($17) + + srl $2,$20,$8 + ldl $1,-80($17) + sll $2,$19,$22 + ldl $2,-88($17) + + stl $5,-24($16) + or $7,$24,$7 + stl $6,-32($16) + or $8,$21,$8 + + srl $3,$20,$5 + unop # ldl $31,-96($17) + sll $3,$19,$23 + subl $16,32,$16 + + srl $4,$20,$6 + ldl $3,-96($17) + sll $4,$19,$24 + ldl $4,-104($17) + + subl $17,32,$17 + bne $18,.Loop + # cool down phase 2/1 +.Lend2: stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + stl $5,-24($16) + or $7,$24,$7 + stl $6,-32($16) + or $8,$21,$8 + srl $3,$20,$5 + sll $3,$19,$23 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 2/2 + stl $7,-40($16) + or $5,$22,$5 + stl $8,-48($16) + or $6,$23,$6 + stl $5,-56($16) + stl $6,-64($16) + # cool down phase 2/3 + stl $24,-72($16) + ret $31,($26),1 + + # cool down phase 1/1 +.Lend1: srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 1/2 + stl $7,-8($16) + or $5,$22,$5 + stl $8,-16($16) + or $6,$23,$6 + stl $5,-24($16) + stl $6,-32($16) + stl $24,-40($16) + ret $31,($26),1 + +.Lend: stl $24,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/sysdeps/sw_64/sw8a/rshift.S b/sysdeps/sw_64/sw8a/rshift.S new file mode 100644 index 00000000..ec2a78b0 --- /dev/null +++ b/sysdeps/sw_64/sw8a/rshift.S @@ -0,0 +1,170 @@ + # Sw_64 __mpn_rshift -- + + # Copyright (C) 1994-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . 
+ + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 3.25 cycles/limb on the sw_64. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldl $4,0($17) # load first limb + subl $31,$19,$20 + subl $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + sll $4,$20,$0 # compute function result + + beq $28,.L0 + subl $18,$28,$18 + + .align 3 +.Loop0: ldl $3,8($17) + addl $16,8,$16 + srl $4,$19,$5 + addl $17,8,$17 + subl $28,1,$28 + sll $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stl $8,-8($16) + bne $28,.Loop0 + +.L0: srl $4,$19,$24 + beq $18,.Lend + # warm up phase 1 + ldl $1,8($17) + subl $18,4,$18 + ldl $2,16($17) + ldl $3,24($17) + ldl $4,32($17) + beq $18,.Lend1 + # warm up phase 2 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + ldl $1,40($17) + srl $2,$19,$22 + ldl $2,48($17) + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + ldl $3,56($17) + srl $4,$19,$24 + ldl $4,64($17) + subl $18,4,$18 + beq $18,.Lend2 + .align 4 + # main loop +.Loop: stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + + sll $1,$20,$7 + subl $18,4,$18 + srl $1,$19,$21 + unop # ldl $31,-96($17) + + sll $2,$20,$8 + ldl $1,72($17) + srl $2,$19,$22 + ldl $2,80($17) + + stl $5,16($16) + or $7,$24,$7 + stl $6,24($16) + or $8,$21,$8 + + sll $3,$20,$5 + unop # ldl $31,-96($17) + srl $3,$19,$23 + addl $16,32,$16 + + sll $4,$20,$6 + ldl $3,88($17) + srl $4,$19,$24 + ldl $4,96($17) + + addl $17,32,$17 + bne $18,.Loop + # cool down phase 2/1 +.Lend2: stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + stl $5,16($16) + or $7,$24,$7 + stl $6,24($16) + or $8,$21,$8 + sll $3,$20,$5 + srl $3,$19,$23 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 2/2 + stl $7,32($16) + or $5,$22,$5 + stl $8,40($16) + or $6,$23,$6 + stl $5,48($16) + stl $6,56($16) + # cool down phase 2/3 + stl $24,64($16) + ret $31,($26),1 + + # cool down phase 1/1 +.Lend1: sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 1/2 + stl $7,0($16) + or $5,$22,$5 + stl $8,8($16) + or $6,$23,$6 + stl $5,16($16) + stl $6,24($16) + stl $24,32($16) + ret $31,($26),1 + +.Lend: stl $24,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/sysdeps/sw_64/sw8a/sub_n.S b/sysdeps/sw_64/sw8a/sub_n.S new file mode 100644 index 00000000..95c257f7 --- /dev/null +++ b/sysdeps/sw_64/sw8a/sub_n.S @@ -0,0 +1,147 @@ + # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . 
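+
+ # This is the borrow-propagating mirror of __mpn_add_n.  As a reading
+ # aid, a rough C sketch (the name mpn_sub_n_ref and the 64-bit unsigned
+ # long limb type are illustrative assumptions, not part of this patch):
+ #
+ #   unsigned long
+ #   mpn_sub_n_ref (unsigned long *rp, const unsigned long *s1,
+ #                  const unsigned long *s2, long n)
+ #   {
+ #     unsigned long cy = 0;                 /* running borrow */
+ #     for (long i = 0; i < n; i++)
+ #       {
+ #         unsigned long t = s2[i] + cy;     /* cy add */
+ #         unsigned long c1 = t < cy;        /* carry out of the cy add */
+ #         unsigned long d = s1[i] - t;      /* main sub */
+ #         unsigned long c2 = s1[i] < d;     /* borrow out of the sub */
+ #         rp[i] = d;
+ #         cy = c1 | c2;
+ #       }
+ #     return cy;                            /* final borrow */
+ #   }
+ #
+ # The cmpult/or pairs below compute exactly these two partial borrows,
+ # unrolled four limbs at a time.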
+ + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + or $31,$31,$25 # clear cy + subl $19,4,$19 # decr loop cnt + blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop + # Start software pipeline for 1st loop + ldl $0,0($18) + ldl $1,8($18) + ldl $4,0($17) + ldl $5,8($17) + addl $17,32,$17 # update s1_ptr + ldl $2,16($18) + subl $4,$0,$20 # 1st main sub + ldl $3,24($18) + subl $19,4,$19 # decr loop cnt + ldl $6,-16($17) + cmpult $4,$20,$25 # compute cy from last sub + ldl $7,-8($17) + addl $1,$25,$28 # cy add + addl $18,32,$18 # update s2_ptr + subl $5,$28,$21 # 2nd main sub + cmpult $28,$25,$8 # compute cy from last add + blt $19,.Lend1 # if less than 4 limbs remain, jump + # 1st loop handles groups of 4 limbs in a software pipeline + .align 4 +.Loop: cmpult $5,$21,$25 # compute cy from last add + ldl $0,0($18) + or $8,$25,$25 # combine cy from the two fadds + ldl $1,8($18) + addl $2,$25,$28 # cy add + ldl $4,0($17) + subl $6,$28,$22 # 3rd main sub + ldl $5,8($17) + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + subl $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + addl $17,32,$17 # update s1_ptr + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + addl $0,$25,$28 # cy add + ldl $2,16($18) + subl $4,$28,$20 # 1st main sub + ldl $3,24($18) + cmpult $28,$25,$8 # compute cy from last add + ldl $6,-16($17) + cmpult $4,$20,$25 # compute cy from last add + ldl $7,-8($17) + or $8,$25,$25 # combine cy from the two fadds + subl $19,4,$19 # decr loop cnt + stl $22,-16($16) + addl $1,$25,$28 # cy add + stl $23,-8($16) + subl $5,$28,$21 # 2nd main sub + addl $18,32,$18 # update s2_ptr + cmpult $28,$25,$8 # compute cy from last add + bge $19,.Loop + # Finish software pipeline for 1st loop +.Lend1: cmpult $5,$21,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $2,$25,$28 # cy add + subl $6,$28,$22 # 3rd main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $6,$22,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + stl $21,8($16) + addl $3,$25,$28 # cy add + subl $7,$28,$23 # 4th main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $7,$23,$25 # compute cy from last add + or $8,$25,$25 # combine cy from the two fadds + addl $16,32,$16 # update res_ptr + stl $22,-16($16) + stl $23,-8($16) +.Lend2: addl $19,4,$19 # restore loop cnt + beq $19,.Lret + # Start software pipeline for 2nd loop + ldl $0,0($18) + ldl $4,0($17) + subl $19,1,$19 + beq $19,.Lend0 + # 2nd loop handles remaining 1-3 limbs + .align 4 +.Loop0: addl $0,$25,$28 # cy add + ldl $0,8($18) + subl $4,$28,$20 # main sub + ldl $1,8($17) + addl $18,8,$18 + cmpult $28,$25,$8 # compute cy from last add + addl $17,8,$17 + stl $20,0($16) + cmpult $4,$20,$25 # compute cy from last add + subl $19,1,$19 # decr loop cnt + or $8,$25,$25 # combine cy from the two fadds + addl $16,8,$16 + or $1,$31,$4 + bne $19,.Loop0 +.Lend0: addl $0,$25,$28 # cy add + subl $4,$28,$20 # main sub + cmpult $28,$25,$8 # compute cy from last add + cmpult $4,$20,$25 # compute cy from last add + stl $20,0($16) + or $8,$25,$25 # combine cy from the two fadds + 
+.Lret: or $25,$31,$0 # return cy + ret $31,($26),1 + .end __mpn_sub_n diff --git a/sysdeps/sw_64/udiv_qrnnd.S b/sysdeps/sw_64/udiv_qrnnd.S new file mode 100644 index 00000000..054034cd --- /dev/null +++ b/sysdeps/sw_64/udiv_qrnnd.S @@ -0,0 +1,159 @@ + # Sw_64 1621 __udiv_qrnnd + + # Copyright (C) 1992-2023 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library. If not, see . + +#include + + .set noreorder + .set noat + + .text + +LEAF(__udiv_qrnnd, 0) +#ifdef PROF + ldgp gp, 0(pv) + ldi AT, _mcount + call AT, (AT), _mcount + .prologue 1 +#else + .prologue 0 +#endif + +#define cnt $2 +#define tmp $3 +#define rem_ptr $16 +#define n1 $17 +#define n0 $18 +#define d $19 +#define qb $20 + + ldi cnt,16 + blt d,$largedivisor + +$loop1: cmplt n0,0,tmp + addl n1,n1,n1 + bis n1,tmp,n1 + addl n0,n0,n0 + cmpule d,n1,qb + subl n1,d,tmp + selne qb,tmp,n1,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addl n1,n1,n1 + bis n1,tmp,n1 + addl n0,n0,n0 + cmpule d,n1,qb + subl n1,d,tmp + selne qb,tmp,n1,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addl n1,n1,n1 + bis n1,tmp,n1 + addl n0,n0,n0 + cmpule d,n1,qb + subl n1,d,tmp + selne qb,tmp,n1,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addl n1,n1,n1 + bis n1,tmp,n1 + addl n0,n0,n0 + cmpule d,n1,qb + subl n1,d,tmp + selne qb,tmp,n1,n1 + bis n0,qb,n0 + subl cnt,1,cnt + bgt cnt,$loop1 + stl n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + +$largedivisor: + and n0,1,$4 + + srl n0,1,n0 + sll n1,63,tmp + or tmp,n0,n0 + srl n1,1,n1 + + and d,1,$6 + srl d,1,$5 + addl $5,$6,$5 + +$loop2: cmplt n0,0,tmp + addl n1,n1,n1 + bis n1,tmp,n1 + addl n0,n0,n0 + cmpule $5,n1,qb + subl n1,$5,tmp + selne qb,tmp,n1,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addl n1,n1,n1 + bis n1,tmp,n1 + addl n0,n0,n0 + cmpule $5,n1,qb + subl n1,$5,tmp + selne qb,tmp,n1,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addl n1,n1,n1 + bis n1,tmp,n1 + addl n0,n0,n0 + cmpule $5,n1,qb + subl n1,$5,tmp + selne qb,tmp,n1,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addl n1,n1,n1 + bis n1,tmp,n1 + addl n0,n0,n0 + cmpule $5,n1,qb + subl n1,$5,tmp + selne qb,tmp,n1,n1 + bis n0,qb,n0 + subl cnt,1,cnt + bgt cnt,$loop2 + + addl n1,n1,n1 + addl $4,n1,n1 + bne $6,$Odd + stl n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + +$Odd: + /* q' in n0. r' in n1 */ + addl n1,n0,n1 + + cmpult n1,n0,tmp # tmp := carry from addl + subl n1,d,AT + addl n0,tmp,n0 + selne tmp,AT,n1,n1 + + cmpult n1,d,tmp + addl n0,1,AT + seleq tmp,AT,n0,n0 + subl n1,d,AT + seleq tmp,AT,n1,n1 + + stl n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + + .end __udiv_qrnnd -- 2.25.1
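
Note on sysdeps/sw_64/udiv_qrnnd.S above: $loop1 and $loop2 are classic
restoring shift-and-subtract division, unrolled four bits per pass with a
16-pass counter (64 quotient bits in total).  As a reference sketch, the
normal-divisor path ($loop1, taken when the divisor's top bit is clear)
behaves like the C below.  The name udiv_qrnnd_ref is illustrative, not
part of the patch, and n1 < d is assumed on entry, the usual __udiv_qrnnd
precondition.

    /* Divide the 128-bit value n1:n0 by d; return the quotient and store
       the remainder through rem.  Requires n1 < d and d < 2^63.  */
    unsigned long
    udiv_qrnnd_ref (unsigned long *rem, unsigned long n1, unsigned long n0,
                    unsigned long d)
    {
      for (int i = 0; i < 64; i++)
        {
          unsigned long top = n0 >> 63;   /* bit about to leave n0 */
          n1 = (n1 << 1) | top;           /* shift n1:n0 left by one */
          n0 = n0 << 1;
          if (n1 >= d)                    /* trial subtraction */
            {
              n1 -= d;
              n0 |= 1;                    /* record a 1 quotient bit */
            }
        }
      *rem = n1;
      return n0;                          /* quotient */
    }

The $largedivisor path halves the numerator and divides by ceil(d/2) so the
same loop also covers divisors with the top bit set; the code after $loop2,
including the $Odd tail, then reconstructs the true quotient and remainder.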