7989 lines
211 KiB
Diff
7989 lines
211 KiB
Diff
|
|
From 8045463341b2495da7b2e7dc308a023764315bbe Mon Sep 17 00:00:00 2001
|
||
|
|
From: swcompiler <lc@wxiat.com>
|
||
|
|
Date: Fri, 29 Nov 2024 14:15:45 +0800
|
||
|
|
Subject: [PATCH 11/23] Sw64: Integer Operation Support
|
||
|
|
|
||
|
|
---
|
||
|
|
sysdeps/sw_64/add_n.S | 118 +++++++++
|
||
|
|
sysdeps/sw_64/addmul_1.S | 89 +++++++
|
||
|
|
sysdeps/sw_64/bzero.S | 107 ++++++++
|
||
|
|
sysdeps/sw_64/div.S | 83 ++++++
|
||
|
|
sysdeps/sw_64/div_libc.h | 170 ++++++++++++
|
||
|
|
sysdeps/sw_64/divl.S | 96 +++++++
|
||
|
|
sysdeps/sw_64/divlu.S | 4 +
|
||
|
|
sysdeps/sw_64/divq.S | 290 +++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/divqu.S | 292 +++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/htonl.S | 43 +++
|
||
|
|
sysdeps/sw_64/htons.S | 39 +++
|
||
|
|
sysdeps/sw_64/ldiv.S | 222 ++++++++++++++++
|
||
|
|
sysdeps/sw_64/lldiv.S | 1 +
|
||
|
|
sysdeps/sw_64/lshift.S | 107 ++++++++
|
||
|
|
sysdeps/sw_64/mul_1.S | 82 ++++++
|
||
|
|
sysdeps/sw_64/reml.S | 93 +++++++
|
||
|
|
sysdeps/sw_64/remlu.S | 4 +
|
||
|
|
sysdeps/sw_64/remq.S | 274 ++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/remqu.S | 292 +++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/rshift.S | 105 ++++++++
|
||
|
|
sysdeps/sw_64/sub_n.S | 118 +++++++++
|
||
|
|
sysdeps/sw_64/submul_1.S | 89 +++++++
|
||
|
|
sysdeps/sw_64/sw6a/add_n.S | 146 +++++++++++
|
||
|
|
sysdeps/sw_64/sw6a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/sw6a/lshift.S | 172 ++++++++++++
|
||
|
|
sysdeps/sw_64/sw6a/rshift.S | 170 ++++++++++++
|
||
|
|
sysdeps/sw_64/sw6a/sub_n.S | 147 +++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/add_n.S | 146 +++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/lshift.S | 172 ++++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/memcpy.S | 416 +++++++++++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/memset.S | 312 ++++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/rshift.S | 170 ++++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/stxcpy.S | 314 ++++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/stxncpy.S | 392 ++++++++++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/sw6b/sub_n.S | 147 +++++++++++
|
||
|
|
sysdeps/sw_64/sw8a/add_n.S | 146 +++++++++++
|
||
|
|
sysdeps/sw_64/sw8a/addmul_1.S | 475 ++++++++++++++++++++++++++++++++++
|
||
|
|
sysdeps/sw_64/sw8a/lshift.S | 172 ++++++++++++
|
||
|
|
sysdeps/sw_64/sw8a/rshift.S | 170 ++++++++++++
|
||
|
|
sysdeps/sw_64/sw8a/sub_n.S | 147 +++++++++++
|
||
|
|
sysdeps/sw_64/udiv_qrnnd.S | 159 ++++++++++++
|
||
|
|
42 files changed, 7641 insertions(+)
|
||
|
|
create mode 100644 sysdeps/sw_64/add_n.S
|
||
|
|
create mode 100644 sysdeps/sw_64/addmul_1.S
|
||
|
|
create mode 100644 sysdeps/sw_64/bzero.S
|
||
|
|
create mode 100644 sysdeps/sw_64/div.S
|
||
|
|
create mode 100644 sysdeps/sw_64/div_libc.h
|
||
|
|
create mode 100644 sysdeps/sw_64/divl.S
|
||
|
|
create mode 100644 sysdeps/sw_64/divlu.S
|
||
|
|
create mode 100644 sysdeps/sw_64/divq.S
|
||
|
|
create mode 100644 sysdeps/sw_64/divqu.S
|
||
|
|
create mode 100644 sysdeps/sw_64/htonl.S
|
||
|
|
create mode 100644 sysdeps/sw_64/htons.S
|
||
|
|
create mode 100644 sysdeps/sw_64/ldiv.S
|
||
|
|
create mode 100644 sysdeps/sw_64/lldiv.S
|
||
|
|
create mode 100644 sysdeps/sw_64/lshift.S
|
||
|
|
create mode 100644 sysdeps/sw_64/mul_1.S
|
||
|
|
create mode 100644 sysdeps/sw_64/reml.S
|
||
|
|
create mode 100644 sysdeps/sw_64/remlu.S
|
||
|
|
create mode 100644 sysdeps/sw_64/remq.S
|
||
|
|
create mode 100644 sysdeps/sw_64/remqu.S
|
||
|
|
create mode 100644 sysdeps/sw_64/rshift.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sub_n.S
|
||
|
|
create mode 100644 sysdeps/sw_64/submul_1.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6a/add_n.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6a/addmul_1.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6a/lshift.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6a/rshift.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6a/sub_n.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/add_n.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/addmul_1.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/lshift.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/memcpy.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/memset.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/rshift.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/stxcpy.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/stxncpy.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw6b/sub_n.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw8a/add_n.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw8a/addmul_1.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw8a/lshift.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw8a/rshift.S
|
||
|
|
create mode 100644 sysdeps/sw_64/sw8a/sub_n.S
|
||
|
|
create mode 100644 sysdeps/sw_64/udiv_qrnnd.S
|
||
|
|
|
||
|
|
diff --git a/sysdeps/sw_64/add_n.S b/sysdeps/sw_64/add_n.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..8c5c8c08
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/add_n.S
|
||
|
|
@@ -0,0 +1,118 @@
|
||
|
|
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
|
||
|
|
+ # store sum in a third limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # s2_ptr $18
|
||
|
|
+ # size $19
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_add_n
|
||
|
|
+ .ent __mpn_add_n
|
||
|
|
+__mpn_add_n:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ ldl $3,0($17)
|
||
|
|
+ ldl $4,0($18)
|
||
|
|
+
|
||
|
|
+ subl $19,1,$19
|
||
|
|
+ and $19,4-1,$2 # number of limbs in first loop
|
||
|
|
+ bis $31,$31,$0
|
||
|
|
+ beq $2,.L0 # if multiple of 4 limbs, skip first loop
|
||
|
|
+
|
||
|
|
+ subl $19,$2,$19
|
||
|
|
+
|
||
|
|
+.Loop0: subl $2,1,$2
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $4,$0,$4
|
||
|
|
+ ldl $6,8($18)
|
||
|
|
+ cmpult $4,$0,$1
|
||
|
|
+ addl $3,$4,$4
|
||
|
|
+ cmpult $4,$3,$0
|
||
|
|
+ stl $4,0($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ addl $18,8,$18
|
||
|
|
+ bis $5,$5,$3
|
||
|
|
+ bis $6,$6,$4
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ bne $2,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: beq $19,.Lend
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop: subl $19,4,$19
|
||
|
|
+
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $4,$0,$4
|
||
|
|
+ ldl $6,8($18)
|
||
|
|
+ cmpult $4,$0,$1
|
||
|
|
+ addl $3,$4,$4
|
||
|
|
+ cmpult $4,$3,$0
|
||
|
|
+ stl $4,0($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ ldl $3,16($17)
|
||
|
|
+ addl $6,$0,$6
|
||
|
|
+ ldl $4,16($18)
|
||
|
|
+ cmpult $6,$0,$1
|
||
|
|
+ addl $5,$6,$6
|
||
|
|
+ cmpult $6,$5,$0
|
||
|
|
+ stl $6,8($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ ldl $5,24($17)
|
||
|
|
+ addl $4,$0,$4
|
||
|
|
+ ldl $6,24($18)
|
||
|
|
+ cmpult $4,$0,$1
|
||
|
|
+ addl $3,$4,$4
|
||
|
|
+ cmpult $4,$3,$0
|
||
|
|
+ stl $4,16($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ ldl $3,32($17)
|
||
|
|
+ addl $6,$0,$6
|
||
|
|
+ ldl $4,32($18)
|
||
|
|
+ cmpult $6,$0,$1
|
||
|
|
+ addl $5,$6,$6
|
||
|
|
+ cmpult $6,$5,$0
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ addl $17,32,$17
|
||
|
|
+ addl $18,32,$18
|
||
|
|
+ addl $16,32,$16
|
||
|
|
+ bne $19,.Loop
|
||
|
|
+
|
||
|
|
+.Lend: addl $4,$0,$4
|
||
|
|
+ cmpult $4,$0,$1
|
||
|
|
+ addl $3,$4,$4
|
||
|
|
+ cmpult $4,$3,$0
|
||
|
|
+ stl $4,0($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ .end __mpn_add_n
|
||
|
|
diff --git a/sysdeps/sw_64/addmul_1.S b/sysdeps/sw_64/addmul_1.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..138e3c69
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/addmul_1.S
|
||
|
|
@@ -0,0 +1,89 @@
|
||
|
|
+ # Sw_64 1621 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
|
||
|
|
+ # the result to a second limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # s2_limb r19
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_addmul_1
|
||
|
|
+ .ent __mpn_addmul_1 2
|
||
|
|
+__mpn_addmul_1:
|
||
|
|
+ .frame $30,0,$26
|
||
|
|
+
|
||
|
|
+ ldl $2,0($17) # $2 = s1_limb
|
||
|
|
+ addl $17,8,$17 # s1_ptr++
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ ldl $5,0($16) # $5 = *res_ptr
|
||
|
|
+ umulh $2,$19,$0 # $0 = prod_high
|
||
|
|
+ beq $18,.Lend1 # jump if size was == 1
|
||
|
|
+ ldl $2,0($17) # $2 = s1_limb
|
||
|
|
+ addl $17,8,$17 # s1_ptr++
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ addl $5,$3,$3
|
||
|
|
+ cmpult $3,$5,$4
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ addl $16,8,$16 # res_ptr++
|
||
|
|
+ beq $18,.Lend2 # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop: mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ ldl $5,0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ umulh $2,$19,$4 # $4 = cy_limb
|
||
|
|
+ ldl $2,0($17) # $2 = s1_limb
|
||
|
|
+ addl $17,8,$17 # s1_ptr++
|
||
|
|
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5,$3,$3
|
||
|
|
+ cmpult $3,$5,$5
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ addl $16,8,$16 # res_ptr++
|
||
|
|
+ addl $5,$0,$0 # combine carries
|
||
|
|
+ bne $18,.Loop
|
||
|
|
+
|
||
|
|
+.Lend2: mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ ldl $5,0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2,$19,$4 # $4 = cy_limb
|
||
|
|
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5,$3,$3
|
||
|
|
+ cmpult $3,$5,$5
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ addl $5,$0,$0 # combine carries
|
||
|
|
+ addl $4,$0,$0 # cy_limb = prod_high + cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+.Lend1: addl $5,$3,$3
|
||
|
|
+ cmpult $3,$5,$5
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ addl $0,$5,$0
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ .end __mpn_addmul_1
|
||
|
|
diff --git a/sysdeps/sw_64/bzero.S b/sysdeps/sw_64/bzero.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..1a020afd
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/bzero.S
|
||
|
|
@@ -0,0 +1,107 @@
|
||
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
||
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+/* Fill a block of memory with zeros. Optimized for the Sw_64 architecture:
|
||
|
|
+
|
||
|
|
+ - memory accessed as aligned quadwords only
|
||
|
|
+ - destination memory not read unless needed for good cache behaviour
|
||
|
|
+ - basic blocks arranged to optimize branch prediction for full-quadword
|
||
|
|
+ aligned memory blocks.
|
||
|
|
+ - partial head and tail quadwords constructed with byte-mask instructions
|
||
|
|
+
|
||
|
|
+*/
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+
|
||
|
|
+ .set noat
|
||
|
|
+ .set noreorder
|
||
|
|
+
|
||
|
|
+ .text
|
||
|
|
+ .type __bzero, @function
|
||
|
|
+ .globl __bzero
|
||
|
|
+ .usepv __bzero, USEPV_PROF
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+
|
||
|
|
+ /* On entry to this basic block:
|
||
|
|
+ t3 == loop counter
|
||
|
|
+ t4 == bytes in partial final word
|
||
|
|
+ a0 == possibly misaligned destination pointer */
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+bzero_loop:
|
||
|
|
+ beq t3, $tail #
|
||
|
|
+ blbc t3, 0f # skip single store if count even
|
||
|
|
+
|
||
|
|
+ stl_u zero, 0(a0) # e0 : store one word
|
||
|
|
+ subl t3, 1, t3 # .. e1 :
|
||
|
|
+ addl a0, 8, a0 # e0 :
|
||
|
|
+ beq t3, $tail # .. e1 :
|
||
|
|
+
|
||
|
|
+0: stl_u zero, 0(a0) # e0 : store two words
|
||
|
|
+ subl t3, 2, t3 # .. e1 :
|
||
|
|
+ stl_u zero, 8(a0) # e0 :
|
||
|
|
+ addl a0, 16, a0 # .. e1 :
|
||
|
|
+ bne t3, 0b # e1 :
|
||
|
|
+
|
||
|
|
+$tail: bne t4, 1f # is there a tail to do?
|
||
|
|
+ ret # no
|
||
|
|
+
|
||
|
|
+1: ldl_u t0, 0(a0) # yes, load original data
|
||
|
|
+ mask7b t0, t4, t0 #
|
||
|
|
+ stl_u t0, 0(a0) #
|
||
|
|
+ ret #
|
||
|
|
+
|
||
|
|
+__bzero:
|
||
|
|
+#ifdef PROF
|
||
|
|
+ ldgp gp, 0(pv)
|
||
|
|
+ ldi AT, _mcount
|
||
|
|
+ call AT, (AT), _mcount
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ mov a0, v0 # e0 : move return value in place
|
||
|
|
+ beq a1, $done # .. e1 : early exit for zero-length store
|
||
|
|
+ and a0, 7, t1 # e0 :
|
||
|
|
+ addl a1, t1, a1 # e1 : add dest misalignment to count
|
||
|
|
+ srl a1, 3, t3 # e0 : loop = count >> 3
|
||
|
|
+ and a1, 7, t4 # .. e1 : find number of bytes in tail
|
||
|
|
+ unop # :
|
||
|
|
+ beq t1, bzero_loop # e1 : aligned head, jump right in
|
||
|
|
+
|
||
|
|
+ ldl_u t0, 0(a0) # e0 : load original data to mask into
|
||
|
|
+ cmpult a1, 8, t2 # .. e1 : is this a sub-word set
|
||
|
|
+ bne t2, $oneq # e1 :
|
||
|
|
+
|
||
|
|
+ mask3b t0, a0, t0 # e0 : we span words. finish this partial
|
||
|
|
+ subl t3, 1, t3 # e0 :
|
||
|
|
+ addl a0, 8, a0 # .. e1 :
|
||
|
|
+ stl_u t0, -8(a0) # e0 :
|
||
|
|
+ br bzero_loop # .. e1 :
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+$oneq:
|
||
|
|
+ mask3b t0, a0, t2 # e0 :
|
||
|
|
+ mask7b t0, a1, t3 # e0 :
|
||
|
|
+ or t2, t3, t0 # e1 :
|
||
|
|
+ stl_u t0, 0(a0) # e0 :
|
||
|
|
+
|
||
|
|
+$done: ret
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
+weak_alias (__bzero, bzero)
|
||
|
|
diff --git a/sysdeps/sw_64/div.S b/sysdeps/sw_64/div.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..6dbdcb7f
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/div.S
|
||
|
|
@@ -0,0 +1,83 @@
|
||
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+ Contributed by Richard Henderson <rth@tamu.edu>.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include "div_libc.h"
|
||
|
|
+
|
||
|
|
+#undef FRAME
|
||
|
|
+#ifdef __sw_64_fix__
|
||
|
|
+#define FRAME 0
|
||
|
|
+#else
|
||
|
|
+#define FRAME 16
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ .set noat
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ .globl div
|
||
|
|
+ .ent div
|
||
|
|
+div:
|
||
|
|
+ .frame sp, FRAME, ra
|
||
|
|
+#if FRAME > 0
|
||
|
|
+ ldi sp, -FRAME(sp)
|
||
|
|
+#endif
|
||
|
|
+#ifdef PROF
|
||
|
|
+ .set macro
|
||
|
|
+ ldgp gp, 0(pv)
|
||
|
|
+ ldi AT, _mcount
|
||
|
|
+ call AT, (AT), _mcount
|
||
|
|
+ .set nomacro
|
||
|
|
+ .prologue 1
|
||
|
|
+#else
|
||
|
|
+ .prologue 0
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ beq $18, $divbyzero
|
||
|
|
+ rfpcr $f10
|
||
|
|
+ _ITOFT2 $17, $f0, 0, $18, $f1, 8
|
||
|
|
+ fcvtld $f0, $f11
|
||
|
|
+ fcvtld $f1, $f12
|
||
|
|
+ fdivd $f11, $f12, $f1
|
||
|
|
+ fcvtdl_z $f1, $f0
|
||
|
|
+ wfpcr $f10
|
||
|
|
+ _FTOIT $f0, $0, 0
|
||
|
|
+
|
||
|
|
+ mulw $0, $18, $1
|
||
|
|
+ subw $17, $1, $1
|
||
|
|
+
|
||
|
|
+ stw $0, 0(a0)
|
||
|
|
+ stw $1, 4(a0)
|
||
|
|
+ mov a0, v0
|
||
|
|
+
|
||
|
|
+#if FRAME > 0
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+#endif
|
||
|
|
+ ret
|
||
|
|
+
|
||
|
|
+$divbyzero:
|
||
|
|
+ mov a0, v0
|
||
|
|
+ ldi a0, GEN_INTDIV
|
||
|
|
+ sys_call HMC_gentrap
|
||
|
|
+ stw zero, 0(v0)
|
||
|
|
+ stw zero, 4(v0)
|
||
|
|
+
|
||
|
|
+#if FRAME > 0
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+#endif
|
||
|
|
+ ret
|
||
|
|
+
|
||
|
|
+ .end div
|
||
|
|
diff --git a/sysdeps/sw_64/div_libc.h b/sysdeps/sw_64/div_libc.h
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..2066924b
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/div_libc.h
|
||
|
|
@@ -0,0 +1,170 @@
|
||
|
|
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+/* Common bits for implementing software divide. */
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+#ifdef __linux__
|
||
|
|
+# include <asm/gentrap.h>
|
||
|
|
+# include <asm/hmcall.h>
|
||
|
|
+#else
|
||
|
|
+# include <machine/pal.h>
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+/* These are not normal C functions. Argument registers are t10 and t11;
|
||
|
|
+ the result goes in t12; the return address is in t9. Only t12 and AT
|
||
|
|
+ may be clobbered. */
|
||
|
|
+#define X t10
|
||
|
|
+#define Y t11
|
||
|
|
+#define RV t12
|
||
|
|
+#define RA t9
|
||
|
|
+
|
||
|
|
+/* The secureplt format does not allow the division routines to be called
|
||
|
|
+ via plt; there aren't enough registers free to be clobbered. Avoid
|
||
|
|
+ setting the symbol type to STT_FUNC, so that the linker won't be tempted
|
||
|
|
+ to create a plt entry. */
|
||
|
|
+#define funcnoplt notype
|
||
|
|
+
|
||
|
|
+/* None of these functions should use implicit anything. */
|
||
|
|
+ .set nomacro
|
||
|
|
+ .set noat
|
||
|
|
+
|
||
|
|
+/* Code fragment to invoke _mcount for profiling. This should be invoked
|
||
|
|
+ directly after allocation of the stack frame. */
|
||
|
|
+.macro CALL_MCOUNT
|
||
|
|
+#ifdef PROF
|
||
|
|
+ stl ra, 0(sp)
|
||
|
|
+ stl pv, 8(sp)
|
||
|
|
+ stl gp, 16(sp)
|
||
|
|
+ cfi_rel_offset (ra, 0)
|
||
|
|
+ cfi_rel_offset (pv, 8)
|
||
|
|
+ cfi_rel_offset (gp, 16)
|
||
|
|
+ br AT, 1f
|
||
|
|
+ .set macro
|
||
|
|
+1: ldgp gp, 0(AT)
|
||
|
|
+ mov RA, ra
|
||
|
|
+ ldi AT, _mcount
|
||
|
|
+ call AT, (AT), _mcount
|
||
|
|
+ .set nomacro
|
||
|
|
+ ldl ra, 0(sp)
|
||
|
|
+ ldl pv, 8(sp)
|
||
|
|
+ ldl gp, 16(sp)
|
||
|
|
+ cfi_restore (ra)
|
||
|
|
+ cfi_restore (pv)
|
||
|
|
+ cfi_restore (gp)
|
||
|
|
+ /* Realign subsequent code with what we'd have without this
|
||
|
|
+ macro at all. This means aligned with one arithmetic insn
|
||
|
|
+ used within the bundle. */
|
||
|
|
+ .align 4
|
||
|
|
+ nop
|
||
|
|
+#endif
|
||
|
|
+.endm
|
||
|
|
+
|
||
|
|
+/* In order to make the below work, all top-level divide routines must
|
||
|
|
+ use the same frame size. */
|
||
|
|
+#define FRAME 96
|
||
|
|
+
|
||
|
|
+/* Code fragment to generate an integer divide-by-zero fault. When
|
||
|
|
+ building libc.so, we arrange for there to be one copy of this code
|
||
|
|
+ placed late in the dso, such that all branches are forward. When
|
||
|
|
+ building libc.a, we use multiple copies to avoid having an out of
|
||
|
|
+ range branch. Users should jump to DIVBYZERO. */
|
||
|
|
+
|
||
|
|
+.macro DO_DIVBYZERO
|
||
|
|
+#ifdef PIC
|
||
|
|
+#define DIVBYZERO __divbyzero
|
||
|
|
+ .section .gnu.linkonce.t.divbyzero, "ax", @progbits
|
||
|
|
+ .globl __divbyzero
|
||
|
|
+ .type __divbyzero, @function
|
||
|
|
+ .usepv __divbyzero, no
|
||
|
|
+ .hidden __divbyzero
|
||
|
|
+#else
|
||
|
|
+#define DIVBYZERO $divbyzero
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+DIVBYZERO:
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (RA)
|
||
|
|
+ cfi_def_cfa_offset (FRAME)
|
||
|
|
+
|
||
|
|
+ mov a0, RV
|
||
|
|
+ unop
|
||
|
|
+ ldi a0, GEN_INTDIV
|
||
|
|
+ sys_call HMC_gentrap
|
||
|
|
+
|
||
|
|
+ mov RV, a0
|
||
|
|
+ clr RV
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
+ .size DIVBYZERO, .-DIVBYZERO
|
||
|
|
+.endm
|
||
|
|
+
|
||
|
|
+/* Like the sw6a instructions, but fall back to stack use on prior machines. */
|
||
|
|
+#ifdef __sw_64_sw6a__
|
||
|
|
+ .arch sw6a
|
||
|
|
+#endif
|
||
|
|
+#ifdef __sw_64_sw6b__
|
||
|
|
+ .arch sw6b
|
||
|
|
+#endif
|
||
|
|
+#ifdef __sw_64_sw8a__
|
||
|
|
+ .arch sw8a
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+.macro _ITOFS gr, fr, slot
|
||
|
|
+#ifdef __sw_64_fix__
|
||
|
|
+ ifmovs \gr, \fr
|
||
|
|
+#else
|
||
|
|
+ stw \gr, \slot(sp)
|
||
|
|
+ flds \fr, \slot(sp)
|
||
|
|
+#endif
|
||
|
|
+.endm
|
||
|
|
+
|
||
|
|
+.macro _ITOFT gr, fr, slot
|
||
|
|
+#ifdef __sw_64_fix__
|
||
|
|
+ ifmovd \gr, \fr
|
||
|
|
+#else
|
||
|
|
+ stl \gr, \slot(sp)
|
||
|
|
+ fldd \fr, \slot(sp)
|
||
|
|
+#endif
|
||
|
|
+.endm
|
||
|
|
+
|
||
|
|
+.macro _FTOIT fr, gr, slot
|
||
|
|
+#ifdef __sw_64_fix__
|
||
|
|
+ fimovd \fr, \gr
|
||
|
|
+#else
|
||
|
|
+ fstd \fr, \slot(sp)
|
||
|
|
+ ldl \gr, \slot(sp)
|
||
|
|
+#endif
|
||
|
|
+.endm
|
||
|
|
+
|
||
|
|
+/* Similarly, but move two registers. Schedules better for pre-sw6a. */
|
||
|
|
+
|
||
|
|
+.macro _ITOFT2 gr1, fr1, slot1, gr2, fr2, slot2
|
||
|
|
+#ifdef __sw_64_fix__
|
||
|
|
+ ifmovd \gr1, \fr1
|
||
|
|
+ ifmovd \gr2, \fr2
|
||
|
|
+#else
|
||
|
|
+ stl \gr1, \slot1(sp)
|
||
|
|
+ stl \gr2, \slot2(sp)
|
||
|
|
+ fldd \fr1, \slot1(sp)
|
||
|
|
+ fldd \fr2, \slot2(sp)
|
||
|
|
+#endif
|
||
|
|
+.endm
|
||
|
|
diff --git a/sysdeps/sw_64/divl.S b/sysdeps/sw_64/divl.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..1192a0aa
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/divl.S
|
||
|
|
@@ -0,0 +1,96 @@
|
||
|
|
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include "div_libc.h"
|
||
|
|
+
|
||
|
|
+/* 32-bit signed int divide. This is not a normal C function. Argument
|
||
|
|
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
|
||
|
|
+ be clobbered.
|
||
|
|
+
|
||
|
|
+ The FPU can handle all input values except zero. Whee!
|
||
|
|
+
|
||
|
|
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
|
||
|
|
+ for cvttq/c even without /sui being set. It will not, however, properly
|
||
|
|
+ raise the exception, so we don't have to worry about FPCR_INED being clear
|
||
|
|
+ and so dying by SIGFPE. */
|
||
|
|
+
|
||
|
|
+ /*****************************************************************
|
||
|
|
+ # *
|
||
|
|
+ # transform to sw-instruct on 2016111216 *
|
||
|
|
+ # *
|
||
|
|
+ #****************************************************************/
|
||
|
|
+
|
||
|
|
+#ifndef EXTEND
|
||
|
|
+#define EXTEND(S,D) sextl S, D
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ .text
|
||
|
|
+ .align 4
|
||
|
|
+ .globl __divw
|
||
|
|
+ .type __divw, @funcnoplt
|
||
|
|
+ .usepv __divw, no
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (RA)
|
||
|
|
+__divw:
|
||
|
|
+ ldi sp, -FRAME(sp)
|
||
|
|
+ cfi_def_cfa_offset (FRAME)
|
||
|
|
+ CALL_MCOUNT
|
||
|
|
+ fstd $f0, 0(sp)
|
||
|
|
+ excb
|
||
|
|
+ beq Y, DIVBYZERO
|
||
|
|
+
|
||
|
|
+ fstd $f1, 8(sp)
|
||
|
|
+ fstd $f2, 16(sp)
|
||
|
|
+ fstd $f3, 40(sp)
|
||
|
|
+ fstd $f4, 48(sp)
|
||
|
|
+ cfi_rel_offset ($f0, 0)
|
||
|
|
+ cfi_rel_offset ($f1, 8)
|
||
|
|
+ cfi_rel_offset ($f2, 16)
|
||
|
|
+ cfi_rel_offset ($f3, 40)
|
||
|
|
+ cfi_rel_offset ($f4, 48)
|
||
|
|
+
|
||
|
|
+ rfpcr $f2
|
||
|
|
+ EXTEND (X, RV)
|
||
|
|
+ EXTEND (Y, AT)
|
||
|
|
+ _ITOFT2 RV, $f0, 24, AT, $f1, 32
|
||
|
|
+ fcvtld $f0, $f3
|
||
|
|
+ fcvtld $f1, $f4
|
||
|
|
+ fdivd $f3, $f4, $f1
|
||
|
|
+ fcvtdl_z $f1, $f0
|
||
|
|
+ wfpcr $f2
|
||
|
|
+ _FTOIT $f0, RV, 24
|
||
|
|
+
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ fldd $f1, 8(sp)
|
||
|
|
+ fldd $f2, 16(sp)
|
||
|
|
+ fldd $f3, 40(sp)
|
||
|
|
+ fldd $f4, 48(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_restore ($f1)
|
||
|
|
+ cfi_restore ($f2)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ sextl RV, RV
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
+ .size __divw, .-__divw
|
||
|
|
+
|
||
|
|
+ DO_DIVBYZERO
|
||
|
|
diff --git a/sysdeps/sw_64/divlu.S b/sysdeps/sw_64/divlu.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..26e1842f
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/divlu.S
|
||
|
|
@@ -0,0 +1,4 @@
|
||
|
|
+#define UNSIGNED
|
||
|
|
+#define EXTEND(S,D) zapnot S, 15, D
|
||
|
|
+#define __divw __divwu
|
||
|
|
+#include <divl.S>
|
||
|
|
diff --git a/sysdeps/sw_64/divq.S b/sysdeps/sw_64/divq.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..61ef58b4
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/divq.S
|
||
|
|
@@ -0,0 +1,290 @@
|
||
|
|
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include "div_libc.h"
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+/* 64-bit signed long divide. These are not normal C functions. Argument
|
||
|
|
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
|
||
|
|
+ be clobbered.
|
||
|
|
+
|
||
|
|
+ Theory of operation here is that we can use the FPU divider for virtually
|
||
|
|
+ all operands that we see: all dividend values between -2**53 and 2**53-1
|
||
|
|
+ can be computed directly. Note that divisor values need not be checked
|
||
|
|
+ against that range because the rounded fp value will be close enough such
|
||
|
|
+ that the quotient is < 1, which will properly be truncated to zero when we
|
||
|
|
+ convert back to integer.
|
||
|
|
+
|
||
|
|
+ When the dividend is outside the range for which we can compute exact
|
||
|
|
+ results, we use the fp quotient as an estimate from which we begin refining
|
||
|
|
+ an exact integral value. This reduces the number of iterations in the
|
||
|
|
+ shift-and-subtract loop significantly.
|
||
|
|
+
|
||
|
|
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
|
||
|
|
+ for cvttq/c even without /sui being set. It will not, however, properly
|
||
|
|
+ raise the exception, so we don't have to worry about FPCR_INED being clear
|
||
|
|
+ and so dying by SIGFPE. */
|
||
|
|
+ /*****************************************************************
|
||
|
|
+ # *
|
||
|
|
+ # transform to sw-instruct on 2016111216 *
|
||
|
|
+ # *
|
||
|
|
+ #****************************************************************/
|
||
|
|
+ .text
|
||
|
|
+ .align 4
|
||
|
|
+ .globl __divl
|
||
|
|
+ .type __divl, @funcnoplt
|
||
|
|
+ .usepv __divl, no
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (RA)
|
||
|
|
+__divl:
|
||
|
|
+ ldi sp, -FRAME(sp)
|
||
|
|
+ cfi_def_cfa_offset (FRAME)
|
||
|
|
+ CALL_MCOUNT
|
||
|
|
+
|
||
|
|
+ /* Get the fp divide insn issued as quickly as possible. After
|
||
|
|
+ that's done, we have at least 22 cycles until its results are
|
||
|
|
+ ready -- all the time in the world to figure out how we're
|
||
|
|
+ going to use the results. */
|
||
|
|
+ fstd $f0, 0(sp)
|
||
|
|
+ excb
|
||
|
|
+ beq Y, DIVBYZERO
|
||
|
|
+
|
||
|
|
+ fstd $f1, 8(sp)
|
||
|
|
+ fstd $f3, 48(sp)
|
||
|
|
+ fstd $f4, 56(sp)
|
||
|
|
+ fstd $f5, 64(sp)
|
||
|
|
+
|
||
|
|
+ cfi_rel_offset ($f0, 0)
|
||
|
|
+ cfi_rel_offset ($f1, 8)
|
||
|
|
+ cfi_rel_offset ($f3, 48)
|
||
|
|
+ cfi_rel_offset ($f4, 56)
|
||
|
|
+ cfi_rel_offset ($f5, 64)
|
||
|
|
+ rfpcr $f3
|
||
|
|
+
|
||
|
|
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
|
||
|
|
+ fcvtld $f0, $f4
|
||
|
|
+ fcvtld $f1, $f5
|
||
|
|
+ fdivd $f4, $f5, $f0
|
||
|
|
+
|
||
|
|
+ /* Check to see if X fit in the double as an exact value. */
|
||
|
|
+ sll X, (64-53), AT
|
||
|
|
+ fldd $f1, 8(sp)
|
||
|
|
+ sra AT, (64-53), AT
|
||
|
|
+ cmpeq X, AT, AT
|
||
|
|
+ beq AT, $x_big
|
||
|
|
+ /* If we get here, we're expecting exact results from the division.
|
||
|
|
+ Do nothing else besides convert and clean up. */
|
||
|
|
+ fcvtdl_z $f0, $f4
|
||
|
|
+ excb
|
||
|
|
+
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ _FTOIT $f4, RV, 16
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ cfi_restore ($f1)
|
||
|
|
+ cfi_remember_state
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+
|
||
|
|
+$x_big:
|
||
|
|
+ /* If we get here, X is large enough that we don't expect exact
|
||
|
|
+ results, and neither X nor Y got mis-translated for the fp
|
||
|
|
+ division. Our task is to take the fp result, figure out how
|
||
|
|
+ far it's off from the correct result and compute a fixup. */
|
||
|
|
+ stl t0, 32(sp)
|
||
|
|
+ stl t1, 40(sp)
|
||
|
|
+ stl t2, 16(sp)
|
||
|
|
+ stl t5, 24(sp)
|
||
|
|
+ cfi_rel_offset (t0, 32)
|
||
|
|
+ cfi_rel_offset (t1, 40)
|
||
|
|
+ cfi_rel_offset (t2, 16)
|
||
|
|
+ cfi_rel_offset (t5, 24)
|
||
|
|
+
|
||
|
|
+#define Q RV /* quotient */
|
||
|
|
+#define R t0 /* remainder */
|
||
|
|
+#define SY t1 /* scaled Y */
|
||
|
|
+#define S t2 /* scalar */
|
||
|
|
+#define QY t3 /* Q*Y */
|
||
|
|
+
|
||
|
|
+ /* The fixup code below can only handle unsigned values. */
|
||
|
|
+ or X, Y, AT
|
||
|
|
+ mov $31, t5
|
||
|
|
+ blt AT, $fix_sign_in
|
||
|
|
+$fix_sign_in_ret1:
|
||
|
|
+ fcvtdl_z $f0, $f4
|
||
|
|
+
|
||
|
|
+ _FTOIT $f4, Q, 8
|
||
|
|
+ .align 3
|
||
|
|
+$fix_sign_in_ret2:
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ stl t3, 0(sp)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_rel_offset (t3, 0)
|
||
|
|
+
|
||
|
|
+ mull Q, Y, QY
|
||
|
|
+ excb
|
||
|
|
+ stl t4, 8(sp)
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ cfi_rel_offset (t4, 8)
|
||
|
|
+
|
||
|
|
+ subl QY, X, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_high
|
||
|
|
+
|
||
|
|
+$q_high_ret:
|
||
|
|
+ subl X, QY, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_low
|
||
|
|
+
|
||
|
|
+$q_low_ret:
|
||
|
|
+ ldl t0, 32(sp)
|
||
|
|
+ ldl t1, 40(sp)
|
||
|
|
+ ldl t2, 16(sp)
|
||
|
|
+ bne t5, $fix_sign_out
|
||
|
|
+
|
||
|
|
+$fix_sign_out_ret:
|
||
|
|
+ ldl t3, 0(sp)
|
||
|
|
+ ldl t4, 8(sp)
|
||
|
|
+ ldl t5, 24(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_remember_state
|
||
|
|
+ cfi_restore (t0)
|
||
|
|
+ cfi_restore (t1)
|
||
|
|
+ cfi_restore (t2)
|
||
|
|
+ cfi_restore (t3)
|
||
|
|
+ cfi_restore (t4)
|
||
|
|
+ cfi_restore (t5)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+ /* The quotient that we computed was too large. We need to reduce
|
||
|
|
+ it by S such that Y*S >= R. Obviously the closer we get to the
|
||
|
|
+ correct value the better, but overshooting high is ok, as we'll
|
||
|
|
+ fix that up later. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_high:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ subl Q, S, Q
|
||
|
|
+ unop
|
||
|
|
+ subl QY, SY, QY
|
||
|
|
+ br $q_high_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ /* The quotient that we computed was too small. Divide Y by the
|
||
|
|
+ current remainder (R) and add that to the existing quotient (Q).
|
||
|
|
+ The expectation, of course, is that R is much smaller than X. */
|
||
|
|
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
|
||
|
|
+ already have a copy of Y in SY and the value 1 in S. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_low:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ /* Shift-down and subtract loop. Each iteration compares our scaled
|
||
|
|
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
|
||
|
|
+ Y's scalar (S) so add it to the quotient (Q). */
|
||
|
|
+2: addl Q, S, t3
|
||
|
|
+ srl S, 1, S
|
||
|
|
+ cmpule SY, R, AT
|
||
|
|
+ subl R, SY, t4
|
||
|
|
+
|
||
|
|
+ selne AT, t3, Q, Q
|
||
|
|
+ selne AT, t4, R, R
|
||
|
|
+ srl SY, 1, SY
|
||
|
|
+ bne S, 2b
|
||
|
|
+
|
||
|
|
+ br $q_low_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$fix_sign_in:
|
||
|
|
+ /* If we got here, then X|Y is negative. Need to adjust everything
|
||
|
|
+ such that we're doing unsigned division in the fixup loop. */
|
||
|
|
+ /* T5 records the changes we had to make:
|
||
|
|
+ bit 0: set if result should be negative.
|
||
|
|
+ bit 2: set if X was negated.
|
||
|
|
+ bit 3: set if Y was negated.
|
||
|
|
+ */
|
||
|
|
+ xor X, Y, AT
|
||
|
|
+ cmplt AT, 0, t5
|
||
|
|
+ cmplt X, 0, AT
|
||
|
|
+ negl X, t0
|
||
|
|
+
|
||
|
|
+ s4addl AT, t5, t5
|
||
|
|
+ selne AT, t0, X, X
|
||
|
|
+ cmplt Y, 0, AT
|
||
|
|
+ negl Y, t0
|
||
|
|
+
|
||
|
|
+ s8addl AT, t5, t5
|
||
|
|
+ selne AT, t0, Y, Y
|
||
|
|
+ unop
|
||
|
|
+ blbc t5, $fix_sign_in_ret1
|
||
|
|
+
|
||
|
|
+ fcvtdl_z $f0, $f4
|
||
|
|
+ _FTOIT $f4, Q, 8
|
||
|
|
+ .align 3
|
||
|
|
+ negl Q, Q
|
||
|
|
+ br $fix_sign_in_ret2
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$fix_sign_out:
|
||
|
|
+ /* Now we get to undo what we did above. */
|
||
|
|
+ /* ??? Is this really faster than just increasing the size of
|
||
|
|
+ the stack frame and storing X and Y in memory? */
|
||
|
|
+ and t5, 8, AT
|
||
|
|
+ negl Y, t4
|
||
|
|
+ selne AT, t4, Y, Y
|
||
|
|
+
|
||
|
|
+ and t5, 4, AT
|
||
|
|
+ negl X, t4
|
||
|
|
+ selne AT, t4, X, X
|
||
|
|
+
|
||
|
|
+ negl RV, t4
|
||
|
|
+ sellbs t5, t4, RV, RV
|
||
|
|
+
|
||
|
|
+ br $fix_sign_out_ret
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
+ .size __divl, .-__divl
|
||
|
|
+
|
||
|
|
+ DO_DIVBYZERO
|
||
|
|
diff --git a/sysdeps/sw_64/divqu.S b/sysdeps/sw_64/divqu.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..7b39201e
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/divqu.S
|
||
|
|
@@ -0,0 +1,292 @@
|
||
|
|
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include "div_libc.h"
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+/* 64-bit unsigned long divide. These are not normal C functions. Argument
|
||
|
|
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may be
|
||
|
|
+ clobbered.
|
||
|
|
+
|
||
|
|
+ Theory of operation here is that we can use the FPU divider for virtually
|
||
|
|
+ all operands that we see: all dividend values between -2**53 and 2**53-1
|
||
|
|
+ can be computed directly. Note that divisor values need not be checked
|
||
|
|
+ against that range because the rounded fp value will be close enough such
|
||
|
|
+ that the quotient is < 1, which will properly be truncated to zero when we
|
||
|
|
+ convert back to integer.
|
||
|
|
+
|
||
|
|
+ When the dividend is outside the range for which we can compute exact
|
||
|
|
+ results, we use the fp quotient as an estimate from which we begin refining
|
||
|
|
+ an exact integral value. This reduces the number of iterations in the
|
||
|
|
+ shift-and-subtract loop significantly.
|
||
|
|
+
|
||
|
|
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
|
||
|
|
+ for cvttq/c even without /sui being set. It will not, however, properly
|
||
|
|
+ raise the exception, so we don't have to worry about FPCR_INED being clear
|
||
|
|
+ and so dying by SIGFPE. */
|
||
|
|
+ /* transform to sw-instruct on 2016111216 */
|
||
|
|
+ .text
|
||
|
|
+ .align 4
|
||
|
|
+ .globl __divlu
|
||
|
|
+ .type __divlu, @funcnoplt
|
||
|
|
+ .usepv __divlu, no
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (RA)
|
||
|
|
+__divlu:
|
||
|
|
+ ldi sp, -FRAME(sp)
|
||
|
|
+ cfi_def_cfa_offset (FRAME)
|
||
|
|
+ CALL_MCOUNT
|
||
|
|
+
|
||
|
|
+ /* Get the fp divide insn issued as quickly as possible. After
|
||
|
|
+ that's done, we have at least 22 cycles until its results are
|
||
|
|
+ ready -- all the time in the world to figure out how we're
|
||
|
|
+ going to use the results. */
|
||
|
|
+ beq Y, DIVBYZERO
|
||
|
|
+ fstd $f0, 0(sp)
|
||
|
|
+ fstd $f1, 8(sp)
|
||
|
|
+ fstd $f3, 48(sp)
|
||
|
|
+ fstd $f4, 56(sp)
|
||
|
|
+ fstd $f5, 64(sp)
|
||
|
|
+ stl t0,32(sp)
|
||
|
|
+ stl t1,40(sp)
|
||
|
|
+ cfi_rel_offset ($f0, 0)
|
||
|
|
+ cfi_rel_offset ($f1, 8)
|
||
|
|
+ cfi_rel_offset ($f3, 48)
|
||
|
|
+ cfi_rel_offset ($f4, 56)
|
||
|
|
+ cfi_rel_offset ($f5, 64)
|
||
|
|
+ cfi_rel_offset (t0, 32)
|
||
|
|
+ cfi_rel_offset (t1, 40)
|
||
|
|
+
|
||
|
|
+ rfpcr $f3
|
||
|
|
+ /* Work around an error observed with -mieee when computing
|
||
|
|
+ 0xffffffffffffffff/2*/
|
||
|
|
+ rfpcr $f1
|
||
|
|
+ fimovd $f1,t0
|
||
|
|
+ ldi t1,3
|
||
|
|
+ sll t1,58,t1
|
||
|
|
+ bic t0,t1,t0
|
||
|
|
+ ifmovd t0,$f1
|
||
|
|
+ wfpcr $f1
|
||
|
|
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
|
||
|
|
+ fcvtld $f0, $f4
|
||
|
|
+ fcvtld $f1, $f5
|
||
|
|
+ blt X, $x_is_neg
|
||
|
|
+ fdivd $f4, $f5, $f0
|
||
|
|
+
|
||
|
|
+ /* Check to see if Y was mis-converted as signed value. */
|
||
|
|
+ fldd $f1, 8(sp)
|
||
|
|
+ blt Y, $y_is_neg
|
||
|
|
+
|
||
|
|
+ /* Check to see if X fit in the double as an exact value. */
|
||
|
|
+ srl X, 53, AT
|
||
|
|
+ bne AT, $x_big
|
||
|
|
+
|
||
|
|
+ /* If we get here, we're expecting exact results from the division.
|
||
|
|
+ Do nothing else besides convert and clean up. */
|
||
|
|
+ fcvtdl $f0, $f4
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ _FTOIT $f4, RV, 16
|
||
|
|
+
|
||
|
|
+ ldl t0,32(sp)
|
||
|
|
+ ldl t1,40(sp)
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_remember_state
|
||
|
|
+ cfi_restore (t0)
|
||
|
|
+ cfi_restore (t1)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_restore ($f1)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+$x_is_neg:
|
||
|
|
+ /* If we get here, X is so big that bit 63 is set, which made the
|
||
|
|
+ conversion come out negative. Fix it up lest we not even get
|
||
|
|
+ a good estimate. */
|
||
|
|
+ ldih AT, 0x5f80 /* 2**64 as float. */
|
||
|
|
+ fstd $f2, 24(sp)
|
||
|
|
+ fstd $f6, 72(sp)
|
||
|
|
+ cfi_rel_offset ($f2, 24)
|
||
|
|
+ cfi_rel_offset ($f5, 72)
|
||
|
|
+ _ITOFS AT, $f2, 16
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ faddd $f4, $f2, $f6
|
||
|
|
+ unop
|
||
|
|
+ fdivd $f6, $f5, $f0
|
||
|
|
+ unop
|
||
|
|
+
|
||
|
|
+ /* Ok, we've now the divide issued. Continue with other checks. */
|
||
|
|
+ fldd $f1, 8(sp)
|
||
|
|
+ unop
|
||
|
|
+ fldd $f2, 24(sp)
|
||
|
|
+ fldd $f6, 72(sp)
|
||
|
|
+ blt Y, $y_is_neg
|
||
|
|
+ cfi_restore ($f1)
|
||
|
|
+ cfi_restore ($f2)
|
||
|
|
+ cfi_restore ($f6)
|
||
|
|
+ cfi_remember_state /* for y_is_neg */
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$x_big:
|
||
|
|
+ /* If we get here, X is large enough that we don't expect exact
|
||
|
|
+ results, and neither X nor Y got mis-translated for the fp
|
||
|
|
+ division. Our task is to take the fp result, figure out how
|
||
|
|
+ far it's off from the correct result and compute a fixup. */
|
||
|
|
+ stl t2, 16(sp)
|
||
|
|
+ stl t3, 24(sp)
|
||
|
|
+ cfi_rel_offset (t0, 32)
|
||
|
|
+ cfi_rel_offset (t1, 40)
|
||
|
|
+ cfi_rel_offset (t2, 16)
|
||
|
|
+ cfi_rel_offset (t3, 24)
|
||
|
|
+
|
||
|
|
+#define Q RV /* quotient */
|
||
|
|
+#define R t0 /* remainder */
|
||
|
|
+#define SY t1 /* scaled Y */
|
||
|
|
+#define S t2 /* scalar */
|
||
|
|
+#define QY t3 /* Q*Y */
|
||
|
|
+
|
||
|
|
+ fcvtdl $f0, $f4
|
||
|
|
+ _FTOIT $f4, Q, 8
|
||
|
|
+ mull Q, Y, QY
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ stl t4, 8(sp)
|
||
|
|
+ excb
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ cfi_rel_offset (t4, 8)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+
|
||
|
|
+ subl QY, X, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_high
|
||
|
|
+
|
||
|
|
+$q_high_ret:
|
||
|
|
+ subl X, QY, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_low
|
||
|
|
+
|
||
|
|
+$q_low_ret:
|
||
|
|
+ ldl t4, 8(sp)
|
||
|
|
+ ldl t0, 32(sp)
|
||
|
|
+ ldl t1, 40(sp)
|
||
|
|
+ ldl t2, 16(sp)
|
||
|
|
+
|
||
|
|
+ ldl t3, 24(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_remember_state
|
||
|
|
+ cfi_restore (t0)
|
||
|
|
+ cfi_restore (t1)
|
||
|
|
+ cfi_restore (t2)
|
||
|
|
+ cfi_restore (t3)
|
||
|
|
+ cfi_restore (t4)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+ /* The quotient that we computed was too large. We need to reduce
|
||
|
|
+ it by S such that Y*S >= R. Obviously the closer we get to the
|
||
|
|
+ correct value the better, but overshooting high is ok, as we'll
|
||
|
|
+ fix that up later. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_high:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ subl Q, S, Q
|
||
|
|
+ unop
|
||
|
|
+ subl QY, SY, QY
|
||
|
|
+ br $q_high_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ /* The quotient that we computed was too small. Divide Y by the
|
||
|
|
+ current remainder (R) and add that to the existing quotient (Q).
|
||
|
|
+ The expectation, of course, is that R is much smaller than X. */
|
||
|
|
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
|
||
|
|
+ already have a copy of Y in SY and the value 1 in S. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_low:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ /* Shift-down and subtract loop. Each iteration compares our scaled
|
||
|
|
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
|
||
|
|
+ Y's scalar (S) so add it to the quotient (Q). */
|
||
|
|
+2: addl Q, S, t3
|
||
|
|
+ srl S, 1, S
|
||
|
|
+ cmpule SY, R, AT
|
||
|
|
+ subl R, SY, t4
|
||
|
|
+
|
||
|
|
+ selne AT, t3, Q, Q
|
||
|
|
+ selne AT, t4, R, R
|
||
|
|
+ srl SY, 1, SY
|
||
|
|
+ bne S, 2b
|
||
|
|
+
|
||
|
|
+ br $q_low_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+$y_is_neg:
|
||
|
|
+ /* If we get here, Y is so big that bit 63 is set. The results
|
||
|
|
+ from the divide will be completely wrong. Fortunately, the
|
||
|
|
+ quotient must be either 0 or 1, so just compute it directly. */
|
||
|
|
+ cmpule Y, X, RV
|
||
|
|
+ excb
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ ldl t0,32(sp)
|
||
|
|
+ ldl t1,40(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_restore (t0)
|
||
|
|
+ cfi_restore (t1)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+ cfi_endproc
|
||
|
|
+ .size __divlu, .-__divlu
|
||
|
|
+
|
||
|
|
+ DO_DIVBYZERO
|
||
|
|
diff --git a/sysdeps/sw_64/htonl.S b/sysdeps/sw_64/htonl.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..7fc0aa24
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/htonl.S
|
||
|
|
@@ -0,0 +1,43 @@
|
||
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+
|
||
|
|
+ENTRY(htonl)
|
||
|
|
+#ifdef PROF
|
||
|
|
+ ldgp gp, 0(pv)
|
||
|
|
+ .set noat
|
||
|
|
+ ldi AT, _mcount
|
||
|
|
+ call AT, (AT), _mcount
|
||
|
|
+ .set at
|
||
|
|
+ .prologue 1
|
||
|
|
+#else
|
||
|
|
+ .prologue 0
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ ins6b a0, 7, t0 # t0 = 0000000000AABBCC
|
||
|
|
+ ins1b a0, 3, t1 # t1 = 000000CCDD000000
|
||
|
|
+ or t1, t0, t1 # t1 = 000000CCDDAABBCC
|
||
|
|
+ srl t1, 16, t2 # t2 = 0000000000CCDDAA
|
||
|
|
+ zapnot t1, 0x0A, t0 # t0 = 00000000DD00BB00
|
||
|
|
+ zapnot t2, 0x05, t3 # t3 = 0000000000CC00AA
|
||
|
|
+ addw t0, t3, v0 # v0 = ssssssssDDCCBBAA
|
||
|
|
+ ret
|
||
|
|
+
|
||
|
|
+ END(htonl)
|
||
|
|
+
|
||
|
|
+weak_alias (htonl, ntohl)
|
||
|
|
diff --git a/sysdeps/sw_64/htons.S b/sysdeps/sw_64/htons.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..8a981be1
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/htons.S
|
||
|
|
@@ -0,0 +1,39 @@
|
||
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+
|
||
|
|
+ENTRY(htons)
|
||
|
|
+#ifdef PROF
|
||
|
|
+ ldgp gp, 0(pv)
|
||
|
|
+ .set noat
|
||
|
|
+ ldi AT, _mcount
|
||
|
|
+ call AT, (AT), _mcount
|
||
|
|
+ .set at
|
||
|
|
+ .prologue 1
|
||
|
|
+#else
|
||
|
|
+ .prologue 0
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ ext5b a0, 7, t1 # t1 = bb00
|
||
|
|
+ ext0b a0, 1, v0 # v0 = 00aa
|
||
|
|
+ bis v0, t1, v0 # v0 = bbaa
|
||
|
|
+ ret
|
||
|
|
+
|
||
|
|
+ END(htons)
|
||
|
|
+
|
||
|
|
+weak_alias (htons, ntohs)
|
||
|
|
diff --git a/sysdeps/sw_64/ldiv.S b/sysdeps/sw_64/ldiv.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..7a77d6dd
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/ldiv.S
|
||
|
|
@@ -0,0 +1,222 @@
|
||
|
|
+
|
||
|
|
+/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+ Contributed by Richard Henderson <rth@tamu.edu>.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+#include "div_libc.h"
|
||
|
|
+
|
||
|
|
+#undef FRAME
|
||
|
|
+#ifdef __sw_64_fix__
|
||
|
|
+#define FRAME 0
|
||
|
|
+#else
|
||
|
|
+#define FRAME 16
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+#undef X
|
||
|
|
+#undef Y
|
||
|
|
+#define X $17
|
||
|
|
+#define Y $18
|
||
|
|
+
|
||
|
|
+ .set noat
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ .globl ldiv
|
||
|
|
+ .ent ldiv
|
||
|
|
+ldiv:
|
||
|
|
+ .frame sp, FRAME, ra
|
||
|
|
+#if FRAME > 0
|
||
|
|
+ ldi sp, -FRAME(sp)
|
||
|
|
+#endif
|
||
|
|
+#ifdef PROF
|
||
|
|
+ .set macro
|
||
|
|
+ ldgp gp, 0(pv)
|
||
|
|
+ ldi AT, _mcount
|
||
|
|
+ call AT, (AT), _mcount
|
||
|
|
+ .set nomacro
|
||
|
|
+ .prologue 1
|
||
|
|
+#else
|
||
|
|
+ .prologue 0
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ beq Y, $divbyzero
|
||
|
|
+ mov Y,t6
|
||
|
|
+ nop
|
||
|
|
+ rfpcr $f10
|
||
|
|
+
|
||
|
|
+ _ITOFT2 X, $f0, 0, Y, $f1, 8
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ fcvtld $f0, $f11
|
||
|
|
+ fcvtld $f1, $f12
|
||
|
|
+ fdivd $f11, $f12, $f0
|
||
|
|
+ unop
|
||
|
|
+
|
||
|
|
+ /* Check to see if X fit in the double as an exact value. */
|
||
|
|
+ sll X, (64-53), AT
|
||
|
|
+ sra AT, (64-53), AT
|
||
|
|
+ cmpeq X, AT, AT
|
||
|
|
+ beq AT, $x_big
|
||
|
|
+
|
||
|
|
+ /* If we get here, we're expecting exact results from the division.
|
||
|
|
+ Do nothing else besides convert and clean up. */
|
||
|
|
+ fcvtdl_z $f0, $f11
|
||
|
|
+ nop
|
||
|
|
+ wfpcr $f10
|
||
|
|
+ _FTOIT $f11, $0, 0
|
||
|
|
+
|
||
|
|
+$egress:
|
||
|
|
+// mull $0, Y, $1
|
||
|
|
+ mull $0, t6, $1
|
||
|
|
+ subl X, $1, $1
|
||
|
|
+
|
||
|
|
+ stl $0, 0($16)
|
||
|
|
+ stl $1, 8($16)
|
||
|
|
+ mov $16, $0
|
||
|
|
+
|
||
|
|
+#if FRAME > 0
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+#endif
|
||
|
|
+ ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$x_big:
|
||
|
|
+ /* If we get here, X is large enough that we don't expect exact
|
||
|
|
+ results, and neither X nor Y got mis-translated for the fp
|
||
|
|
+ division. Our task is to take the fp result, figure out how
|
||
|
|
+ far it's off from the correct result and compute a fixup. */
|
||
|
|
+
|
||
|
|
+#define Q v0 /* quotient */
|
||
|
|
+#define R t0 /* remainder */
|
||
|
|
+#define SY t1 /* scaled Y */
|
||
|
|
+#define S t2 /* scalar */
|
||
|
|
+#define QY t3 /* Q*Y */
|
||
|
|
+
|
||
|
|
+ /* The fixup code below can only handle unsigned values. */
|
||
|
|
+ bis X, Y, AT
|
||
|
|
+ mov $31, t5
|
||
|
|
+ blt AT, $fix_sign_in
|
||
|
|
+$fix_sign_in_ret1:
|
||
|
|
+ fcvtdl_z $f0, $f11
|
||
|
|
+
|
||
|
|
+ _FTOIT $f11, Q, 8
|
||
|
|
+$fix_sign_in_ret2:
|
||
|
|
+ mull Q, Y, QY
|
||
|
|
+ nop
|
||
|
|
+ wfpcr $f10
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ subl QY, X, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_high
|
||
|
|
+
|
||
|
|
+$q_high_ret:
|
||
|
|
+ subl X, QY, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_low
|
||
|
|
+
|
||
|
|
+$q_low_ret:
|
||
|
|
+ negl Q, t4
|
||
|
|
+ sellbs t5, t4, Q, Q
|
||
|
|
+ br $egress
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ /* The quotient that we computed was too large. We need to reduce
|
||
|
|
+ it by S such that Y*S >= R. Obviously the closer we get to the
|
||
|
|
+ correct value the better, but overshooting high is ok, as we'll
|
||
|
|
+ fix that up later. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_high:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ subl Q, S, Q
|
||
|
|
+ unop
|
||
|
|
+ subl QY, SY, QY
|
||
|
|
+ br $q_high_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ /* The quotient that we computed was too small. Divide Y by the
|
||
|
|
+ current remainder (R) and add that to the existing quotient (Q).
|
||
|
|
+ The expectation, of course, is that R is much smaller than X. */
|
||
|
|
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
|
||
|
|
+ already have a copy of Y in SY and the value 1 in S. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_low:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ /* Shift-down and subtract loop. Each iteration compares our scaled
|
||
|
|
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
|
||
|
|
+ Y's scalar (S) so add it to the quotient (Q). */
|
||
|
|
+2: addl Q, S, t3
|
||
|
|
+ srl S, 1, S
|
||
|
|
+ cmpule SY, R, AT
|
||
|
|
+ subl R, SY, t4
|
||
|
|
+
|
||
|
|
+ selne AT, t3, Q, Q
|
||
|
|
+ selne AT, t4, R, R
|
||
|
|
+ srl SY, 1, SY
|
||
|
|
+ bne S, 2b
|
||
|
|
+
|
||
|
|
+ br $q_low_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$fix_sign_in:
|
||
|
|
+ /* If we got here, then X|Y is negative. Need to adjust everything
|
||
|
|
+ such that we're doing unsigned division in the fixup loop. */
|
||
|
|
+ /* T5 is true if result should be negative. */
|
||
|
|
+ xor X, Y, AT
|
||
|
|
+ cmplt AT, 0, t5
|
||
|
|
+ cmplt X, 0, AT
|
||
|
|
+ negl X, t0
|
||
|
|
+
|
||
|
|
+ selne AT, t0, X, X
|
||
|
|
+ cmplt Y, 0, AT
|
||
|
|
+ negl Y, t0
|
||
|
|
+
|
||
|
|
+ selne AT, t0, Y, Y
|
||
|
|
+ blbc t5, $fix_sign_in_ret1
|
||
|
|
+
|
||
|
|
+ fcvtdl_z $f0, $f11
|
||
|
|
+ _FTOIT $f11, Q, 8
|
||
|
|
+ .align 3
|
||
|
|
+ negl Q, Q
|
||
|
|
+ br $fix_sign_in_ret2
|
||
|
|
+
|
||
|
|
+$divbyzero:
|
||
|
|
+ mov a0, v0
|
||
|
|
+ ldi a0, GEN_INTDIV
|
||
|
|
+ sys_call HMC_gentrap
|
||
|
|
+ stl zero, 0(v0)
|
||
|
|
+ stl zero, 8(v0)
|
||
|
|
+
|
||
|
|
+#if FRAME > 0
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+#endif
|
||
|
|
+ ret
|
||
|
|
+
|
||
|
|
+ .end ldiv
|
||
|
|
+
|
||
|
|
+weak_alias (ldiv, lldiv)
|
||
|
|
+weak_alias (ldiv, imaxdiv)
|
||
|
|
diff --git a/sysdeps/sw_64/lldiv.S b/sysdeps/sw_64/lldiv.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..8a8ef97a
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/lldiv.S
|
||
|
|
@@ -0,0 +1 @@
|
||
|
|
+/* lldiv is the same as ldiv on the Sw_64. */
|
||
|
|
diff --git a/sysdeps/sw_64/lshift.S b/sysdeps/sw_64/lshift.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..700e9d80
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/lshift.S
|
||
|
|
@@ -0,0 +1,107 @@
|
||
|
|
+ # Sw_64 1621 __mpn_lshift --
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # cnt r19
|
||
|
|
+
|
||
|
|
+ # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling,
|
||
|
|
+ # it would take 4 cycles/limb. It should be possible to get down to 3
|
||
|
|
+ # cycles/limb since both ldl and stl can be paired with the other used
|
||
|
|
+ # instructions. But there are many restrictions in the 1621 pipeline that
|
||
|
|
+ # makes it hard, if not impossible, to get down to 3 cycles/limb:
|
||
|
|
+
|
||
|
|
+ # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay.
|
||
|
|
+ # 2. Only aligned instruction pairs can be paired.
|
||
|
|
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_lshift
|
||
|
|
+ .ent __mpn_lshift
|
||
|
|
+__mpn_lshift:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ s8addl $18,$17,$17 # make r17 point at end of s1
|
||
|
|
+ ldl $4,-8($17) # load first limb
|
||
|
|
+ subl $17,8,$17
|
||
|
|
+ subl $31,$19,$7
|
||
|
|
+ s8addl $18,$16,$16 # make r16 point at end of RES
|
||
|
|
+ subl $18,1,$18
|
||
|
|
+ and $18,4-1,$20 # number of limbs in first loop
|
||
|
|
+ srl $4,$7,$0 # compute function result
|
||
|
|
+
|
||
|
|
+ beq $20,.L0
|
||
|
|
+ subl $18,$20,$18
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop0:
|
||
|
|
+ ldl $3,-8($17)
|
||
|
|
+ subl $16,8,$16
|
||
|
|
+ subl $17,8,$17
|
||
|
|
+ subl $20,1,$20
|
||
|
|
+ sll $4,$19,$5
|
||
|
|
+ srl $3,$7,$6
|
||
|
|
+ bis $3,$3,$4
|
||
|
|
+ bis $5,$6,$8
|
||
|
|
+ stl $8,0($16)
|
||
|
|
+ bne $20,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: beq $18,.Lend
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop: ldl $3,-8($17)
|
||
|
|
+ subl $16,32,$16
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ sll $4,$19,$5
|
||
|
|
+ srl $3,$7,$6
|
||
|
|
+
|
||
|
|
+ ldl $4,-16($17)
|
||
|
|
+ sll $3,$19,$1
|
||
|
|
+ bis $5,$6,$8
|
||
|
|
+ stl $8,24($16)
|
||
|
|
+ srl $4,$7,$2
|
||
|
|
+
|
||
|
|
+ ldl $3,-24($17)
|
||
|
|
+ sll $4,$19,$5
|
||
|
|
+ bis $1,$2,$8
|
||
|
|
+ stl $8,16($16)
|
||
|
|
+ srl $3,$7,$6
|
||
|
|
+
|
||
|
|
+ ldl $4,-32($17)
|
||
|
|
+ sll $3,$19,$1
|
||
|
|
+ bis $5,$6,$8
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ srl $4,$7,$2
|
||
|
|
+
|
||
|
|
+ subl $17,32,$17
|
||
|
|
+ bis $1,$2,$8
|
||
|
|
+ stl $8,0($16)
|
||
|
|
+
|
||
|
|
+ bgt $18,.Loop
|
||
|
|
+
|
||
|
|
+.Lend: sll $4,$19,$8
|
||
|
|
+ stl $8,-8($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_lshift
|
||
|
|
diff --git a/sysdeps/sw_64/mul_1.S b/sysdeps/sw_64/mul_1.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..127f4274
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/mul_1.S
|
||
|
|
@@ -0,0 +1,82 @@
|
||
|
|
+ # Sw_64 1621 __mpn_mul_1 -- Multiply a limb vector with a limb and store
|
||
|
|
+ # the result in a second limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # s2_limb r19
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # To improve performance for long fmuldiplications, we would use
|
||
|
|
+ # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
|
||
|
|
+ # these instructions without slowing down the general code: 1. We can
|
||
|
|
+ # only have two prefetches in operation at any time in the Sw_64
|
||
|
|
+ # architecture. 2. There will seldom be any special alignment
|
||
|
|
+ # between RES_PTR and S1_PTR. Maybe we can simply divide the current
|
||
|
|
+ # loop into an inner and outer loop, having the inner loop handle
|
||
|
|
+ # exactly one prefetch block?
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_mul_1
|
||
|
|
+ .ent __mpn_mul_1 2
|
||
|
|
+__mpn_mul_1:
|
||
|
|
+ .frame $30,0,$26
|
||
|
|
+
|
||
|
|
+ ldl $2,0($17) # $2 = s1_limb
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ bic $31,$31,$4 # clear cy_limb
|
||
|
|
+ umulh $2,$19,$0 # $0 = prod_high
|
||
|
|
+ beq $18,Lend1 # jump if size was == 1
|
||
|
|
+ ldl $2,8($17) # $2 = s1_limb
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ beq $18,Lend2 # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+Loop: mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ umulh $2,$19,$4 # $4 = cy_limb
|
||
|
|
+ ldl $2,16($17) # $2 = s1_limb
|
||
|
|
+ addl $17,8,$17 # s1_ptr++
|
||
|
|
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
|
||
|
|
+ stl $3,8($16)
|
||
|
|
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $16,8,$16 # res_ptr++
|
||
|
|
+ bne $18,Loop
|
||
|
|
+
|
||
|
|
+Lend2: mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2,$19,$4 # $4 = cy_limb
|
||
|
|
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ stl $3,8($16)
|
||
|
|
+ addl $4,$0,$0 # cy_limb = prod_high + cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+Lend1: stl $3,0($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ .end __mpn_mul_1
|
||
|
|
diff --git a/sysdeps/sw_64/reml.S b/sysdeps/sw_64/reml.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..56a550d9
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/reml.S
|
||
|
|
@@ -0,0 +1,93 @@
|
||
|
|
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
|
||
|
|
+ Contributed by Richard Henderson <rth@twiddle.net>
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include "div_libc.h"
|
||
|
|
+
|
||
|
|
+/* 32-bit signed int remainder. This is not a normal C function. Argument
|
||
|
|
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
|
||
|
|
+ be clobbered.
|
||
|
|
+
|
||
|
|
+ The FPU can handle the division for all input values except zero.
|
||
|
|
+ All we have to do is compute the remainder via multiply-and-subtract.
|
||
|
|
+
|
||
|
|
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
|
||
|
|
+ for cvttq/c even without /sui being set. It will not, however, properly
|
||
|
|
+ raise the exception, so we don't have to worry about FPCR_INED being clear
|
||
|
|
+ and so dying by SIGFPE. */
|
||
|
|
+ /*__reml->__remw 20161111*/
|
||
|
|
+#ifndef EXTEND
|
||
|
|
+#define EXTEND(S,D) sextl S, D
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ .text
|
||
|
|
+ .align 4
|
||
|
|
+ .globl __remw
|
||
|
|
+ .type __remw, @funcnoplt
|
||
|
|
+ .usepv __remw, no
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (RA)
|
||
|
|
+__remw:
|
||
|
|
+ ldi sp, -FRAME(sp)
|
||
|
|
+ cfi_def_cfa_offset (FRAME)
|
||
|
|
+ CALL_MCOUNT
|
||
|
|
+ fstd $f0, 0(sp)
|
||
|
|
+ excb
|
||
|
|
+ beq Y, DIVBYZERO
|
||
|
|
+
|
||
|
|
+ fstd $f1, 8(sp)
|
||
|
|
+ fstd $f2, 16(sp)
|
||
|
|
+ fstd $f3, 40(sp)
|
||
|
|
+ fstd $f4, 48(sp)
|
||
|
|
+ cfi_rel_offset ($f0, 0)
|
||
|
|
+ cfi_rel_offset ($f1, 8)
|
||
|
|
+ cfi_rel_offset ($f2, 16)
|
||
|
|
+ cfi_rel_offset ($f3, 40)
|
||
|
|
+ cfi_rel_offset ($f4, 48)
|
||
|
|
+
|
||
|
|
+ rfpcr $f2
|
||
|
|
+ EXTEND (X, RV)
|
||
|
|
+ EXTEND (Y, AT)
|
||
|
|
+ _ITOFT2 RV, $f0, 24, AT, $f1, 32
|
||
|
|
+ fcvtld $f0, $f3
|
||
|
|
+ fcvtld $f1, $f4
|
||
|
|
+ fdivd $f3, $f4, $f0
|
||
|
|
+ fcvtdl_z $f0, $f3
|
||
|
|
+
|
||
|
|
+ wfpcr $f2
|
||
|
|
+ _FTOIT $f3, RV, 24
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ mulw RV, Y, RV
|
||
|
|
+ fldd $f1, 8(sp)
|
||
|
|
+ fldd $f2, 16(sp)
|
||
|
|
+ fldd $f3, 40(sp)
|
||
|
|
+ fldd $f4, 48(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_restore ($f1)
|
||
|
|
+ cfi_restore ($f2)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ subw X, RV, RV
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
+ .size __remw, .-__remw
|
||
|
|
+
|
||
|
|
+ DO_DIVBYZERO
|
||
|
|
diff --git a/sysdeps/sw_64/remlu.S b/sysdeps/sw_64/remlu.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..3c12f7bf
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/remlu.S
|
||
|
|
@@ -0,0 +1,4 @@
|
||
|
|
+#define UNSIGNED
|
||
|
|
+#define EXTEND(S,D) zapnot S, 15, D
|
||
|
|
+#define __remw __remwu
|
||
|
|
+#include <reml.S>
|
||
|
|
diff --git a/sysdeps/sw_64/remq.S b/sysdeps/sw_64/remq.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..6db7f628
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/remq.S
|
||
|
|
@@ -0,0 +1,274 @@
|
||
|
|
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include "div_libc.h"
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+/* 64-bit signed long remainder. These are not normal C functions. Argument
|
||
|
|
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may
|
||
|
|
+ be clobbered.
|
||
|
|
+
|
||
|
|
+ Theory of operation here is that we can use the FPU divider for virtually
|
||
|
|
+ all operands that we see: all dividend values between -2**53 and 2**53-1
|
||
|
|
+ can be computed directly. Note that divisor values need not be checked
|
||
|
|
+ against that range because the rounded fp value will be close enough such
|
||
|
|
+ that the quotient is < 1, which will properly be truncated to zero when we
|
||
|
|
+ convert back to integer.
|
||
|
|
+
|
||
|
|
+ When the dividend is outside the range for which we can compute exact
|
||
|
|
+ results, we use the fp quotent as an estimate from which we begin refining
|
||
|
|
+ an exact integral value. This reduces the number of iterations in the
|
||
|
|
+ shift-and-subtract loop significantly.
|
||
|
|
+
|
||
|
|
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
|
||
|
|
+ for cvttq/c even without /sui being set. It will not, however, properly
|
||
|
|
+ raise the exception, so we don't have to worry about FPCR_INED being clear
|
||
|
|
+ and so dying by SIGFPE. */
|
||
|
|
+ .text
|
||
|
|
+ .align 4
|
||
|
|
+ .globl __reml
|
||
|
|
+ .type __reml, @funcnoplt
|
||
|
|
+ .usepv __reml, no
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (RA)
|
||
|
|
+__reml:
|
||
|
|
+ ldi sp, -FRAME(sp)
|
||
|
|
+ cfi_def_cfa_offset (FRAME)
|
||
|
|
+ CALL_MCOUNT
|
||
|
|
+
|
||
|
|
+ /* Get the fp divide insn issued as quickly as possible. After
|
||
|
|
+ that's done, we have at least 22 cycles until its results are
|
||
|
|
+ ready -- all the time in the world to figure out how we're
|
||
|
|
+ going to use the results. */
|
||
|
|
+ fstd $f0, 0(sp)
|
||
|
|
+ excb
|
||
|
|
+ beq Y, DIVBYZERO
|
||
|
|
+
|
||
|
|
+ fstd $f1, 8(sp)
|
||
|
|
+ fstd $f3, 48(sp)
|
||
|
|
+ fstd $f4, 56(sp)
|
||
|
|
+ fstd $f5, 64(sp)
|
||
|
|
+ cfi_rel_offset ($f0, 0)
|
||
|
|
+ cfi_rel_offset ($f1, 8)
|
||
|
|
+ cfi_rel_offset ($f3, 48)
|
||
|
|
+ cfi_rel_offset ($f4, 56)
|
||
|
|
+ cfi_rel_offset ($f5, 64)
|
||
|
|
+
|
||
|
|
+ rfpcr $f3
|
||
|
|
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
|
||
|
|
+ fcvtld $f0, $f4
|
||
|
|
+ fcvtld $f1, $f5
|
||
|
|
+ fdivd $f4, $f5, $f0
|
||
|
|
+
|
||
|
|
+ /* Check to see if X fit in the double as an exact value. */
|
||
|
|
+ sll X, (64-53), AT
|
||
|
|
+ fldd $f1, 8(sp)
|
||
|
|
+ sra AT, (64-53), AT
|
||
|
|
+ cmpeq X, AT, AT
|
||
|
|
+ beq AT, $x_big
|
||
|
|
+ fcvtdl_z $f0, $f4
|
||
|
|
+
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ _FTOIT $f4, AT, 16
|
||
|
|
+ mull AT, Y, AT
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ cfi_restore ($f1)
|
||
|
|
+ cfi_remember_state
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ subl X, AT, RV
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+$x_big:
|
||
|
|
+ /* If we get here, X is large enough that we don't expect exact
|
||
|
|
+ results, and neither X nor Y got mis-translated for the fp
|
||
|
|
+ division. Our task is to take the fp result, figure out how
|
||
|
|
+ far it's off from the correct result and compute a fixup. */
|
||
|
|
+ stl t0, 32(sp)
|
||
|
|
+ stl t1, 40(sp)
|
||
|
|
+ stl t2, 16(sp)
|
||
|
|
+ stl t5, 24(sp)
|
||
|
|
+ cfi_rel_offset (t0, 32)
|
||
|
|
+ cfi_rel_offset (t1, 40)
|
||
|
|
+ cfi_rel_offset (t2, 16)
|
||
|
|
+ cfi_rel_offset (t5, 24)
|
||
|
|
+
|
||
|
|
+#define Q t0 /* quotient */
|
||
|
|
+#define R RV /* remainder */
|
||
|
|
+#define SY t1 /* scaled Y */
|
||
|
|
+#define S t2 /* scalar */
|
||
|
|
+#define QY t3 /* Q*Y */
|
||
|
|
+
|
||
|
|
+ /* The fixup code below can only handle unsigned values. */
|
||
|
|
+ or X, Y, AT
|
||
|
|
+ mov $31, t5
|
||
|
|
+ blt AT, $fix_sign_in
|
||
|
|
+$fix_sign_in_ret1:
|
||
|
|
+ fcvtdl_z $f0, $f4
|
||
|
|
+ _FTOIT $f4, Q, 8
|
||
|
|
+ .align 3
|
||
|
|
+$fix_sign_in_ret2:
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ stl t3, 0(sp)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_rel_offset (t3, 0)
|
||
|
|
+
|
||
|
|
+ mull Q, Y, QY
|
||
|
|
+ stl t4, 8(sp)
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ cfi_rel_offset (t4, 8)
|
||
|
|
+
|
||
|
|
+ subl QY, X, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_high
|
||
|
|
+
|
||
|
|
+$q_high_ret:
|
||
|
|
+ subl X, QY, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_low
|
||
|
|
+
|
||
|
|
+$q_low_ret:
|
||
|
|
+ ldl t0, 32(sp)
|
||
|
|
+ ldl t1, 40(sp)
|
||
|
|
+ ldl t2, 16(sp)
|
||
|
|
+ bne t5, $fix_sign_out
|
||
|
|
+
|
||
|
|
+$fix_sign_out_ret:
|
||
|
|
+ ldl t3, 0(sp)
|
||
|
|
+ ldl t4, 8(sp)
|
||
|
|
+ ldl t5, 24(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_remember_state
|
||
|
|
+ cfi_restore (t0)
|
||
|
|
+ cfi_restore (t1)
|
||
|
|
+ cfi_restore (t2)
|
||
|
|
+ cfi_restore (t3)
|
||
|
|
+ cfi_restore (t4)
|
||
|
|
+ cfi_restore (t5)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+ /* The quotient that we computed was too large. We need to reduce
|
||
|
|
+ it by S such that Y*S >= R. Obviously the closer we get to the
|
||
|
|
+ correct value the better, but overshooting high is ok, as we'll
|
||
|
|
+ fix that up later. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_high:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ subl Q, S, Q
|
||
|
|
+ unop
|
||
|
|
+ subl QY, SY, QY
|
||
|
|
+ br $q_high_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ /* The quotient that we computed was too small. Divide Y by the
|
||
|
|
+ current remainder (R) and add that to the existing quotient (Q).
|
||
|
|
+ The expectation, of course, is that R is much smaller than X. */
|
||
|
|
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
|
||
|
|
+ already have a copy of Y in SY and the value 1 in S. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_low:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ /* Shift-down and subtract loop. Each iteration compares our scaled
|
||
|
|
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
|
||
|
|
+ Y's scalar (S) so add it to the quotient (Q). */
|
||
|
|
+2: addl Q, S, t3
|
||
|
|
+ srl S, 1, S
|
||
|
|
+ cmpule SY, R, AT
|
||
|
|
+ subl R, SY, t4
|
||
|
|
+
|
||
|
|
+ selne AT, t3, Q, Q
|
||
|
|
+ selne AT, t4, R, R
|
||
|
|
+ srl SY, 1, SY
|
||
|
|
+ bne S, 2b
|
||
|
|
+
|
||
|
|
+ br $q_low_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$fix_sign_in:
|
||
|
|
+ /* If we got here, then X|Y is negative. Need to adjust everything
|
||
|
|
+ such that we're doing unsigned division in the fixup loop. */
|
||
|
|
+ /* T5 records the changes we had to make:
|
||
|
|
+ bit 0: set if X was negated. Note that the sign of the
|
||
|
|
+ remainder follows the sign of the divisor.
|
||
|
|
+ bit 2: set if Y was negated.
|
||
|
|
+ */
|
||
|
|
+ xor X, Y, t1
|
||
|
|
+ cmplt X, 0, t5
|
||
|
|
+ negl X, t0
|
||
|
|
+ selne t5, t0, X, X
|
||
|
|
+
|
||
|
|
+ cmplt Y, 0, AT
|
||
|
|
+ negl Y, t0
|
||
|
|
+ s4addl AT, t5, t5
|
||
|
|
+ selne AT, t0, Y, Y
|
||
|
|
+
|
||
|
|
+ bge t1, $fix_sign_in_ret1
|
||
|
|
+ fcvtdl_z $f0, $f4
|
||
|
|
+ _FTOIT $f4, Q, 8
|
||
|
|
+ .align 3
|
||
|
|
+ negl Q, Q
|
||
|
|
+ br $fix_sign_in_ret2
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$fix_sign_out:
|
||
|
|
+ /* Now we get to undo what we did above. */
|
||
|
|
+ /* ??? Is this really faster than just increasing the size of
|
||
|
|
+ the stack frame and storing X and Y in memory? */
|
||
|
|
+ and t5, 4, AT
|
||
|
|
+ negl Y, t4
|
||
|
|
+ selne AT, t4, Y, Y
|
||
|
|
+
|
||
|
|
+ negl X, t4
|
||
|
|
+ sellbs t5, t4, X, X
|
||
|
|
+ negl RV, t4
|
||
|
|
+ sellbs t5, t4, RV, RV
|
||
|
|
+
|
||
|
|
+ br $fix_sign_out_ret
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
+ .size __reml, .-__reml
|
||
|
|
+
|
||
|
|
+ DO_DIVBYZERO
|
||
|
|
diff --git a/sysdeps/sw_64/remqu.S b/sysdeps/sw_64/remqu.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..946e031b
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/remqu.S
|
||
|
|
@@ -0,0 +1,292 @@
|
||
|
|
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+#include "div_libc.h"
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+/* 64-bit unsigned long remainder. These are not normal C functions. Argument
|
||
|
|
+ registers are t10 and t11, the result goes in t12. Only t12 and AT may be
|
||
|
|
+ clobbered.
|
||
|
|
+
|
||
|
|
+ Theory of operation here is that we can use the FPU divider for virtually
|
||
|
|
+ all operands that we see: all dividend values between -2**53 and 2**53-1
|
||
|
|
+ can be computed directly. Note that divisor values need not be checked
|
||
|
|
+ against that range because the rounded fp value will be close enough such
|
||
|
|
+ that the quotient is < 1, which will properly be truncated to zero when we
|
||
|
|
+ convert back to integer.
|
||
|
|
+
|
||
|
|
+ When the dividend is outside the range for which we can compute exact
|
||
|
|
+ results, we use the fp quotent as an estimate from which we begin refining
|
||
|
|
+ an exact integral value. This reduces the number of iterations in the
|
||
|
|
+ shift-and-subtract loop significantly.
|
||
|
|
+
|
||
|
|
+ The FPCR save/restore is due to the fact that the SW6 _will_ set FPCR_INE
|
||
|
|
+ for cvttq/c even without /sui being set. It will not, however, properly
|
||
|
|
+ raise the exception, so we don't have to worry about FPCR_INED being clear
|
||
|
|
+ and so dying by SIGFPE. */
|
||
|
|
+ .text
|
||
|
|
+ .align 4
|
||
|
|
+ .globl __remlu
|
||
|
|
+ .type __remlu, @funcnoplt
|
||
|
|
+ .usepv __remlu, no
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (RA)
|
||
|
|
+__remlu:
|
||
|
|
+ ldi sp, -FRAME(sp)
|
||
|
|
+ cfi_def_cfa_offset (FRAME)
|
||
|
|
+ CALL_MCOUNT
|
||
|
|
+
|
||
|
|
+ /* Get the fp divide insn issued as quickly as possible. After
|
||
|
|
+ that's done, we have at least 22 cycles until its results are
|
||
|
|
+ ready -- all the time in the world to figure out how we're
|
||
|
|
+ going to use the results. */
|
||
|
|
+ subl Y, 1, AT
|
||
|
|
+ and Y, AT, AT
|
||
|
|
+ beq AT, $powerof2
|
||
|
|
+ fstd $f0, 0(sp)
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ fstd $f1, 8(sp)
|
||
|
|
+ fstd $f3, 48(sp)
|
||
|
|
+ fstd $f4, 56(sp)
|
||
|
|
+ fstd $f5, 64(sp)
|
||
|
|
+ cfi_rel_offset ($f0, 0)
|
||
|
|
+ cfi_rel_offset ($f1, 8)
|
||
|
|
+ cfi_rel_offset ($f3, 48)
|
||
|
|
+ cfi_rel_offset ($f4, 56)
|
||
|
|
+ cfi_rel_offset ($f5, 64)
|
||
|
|
+
|
||
|
|
+ rfpcr $f3
|
||
|
|
+ _ITOFT2 X, $f0, 16, Y, $f1, 24
|
||
|
|
+
|
||
|
|
+ fcvtld $f0, $f4
|
||
|
|
+ fcvtld $f1, $f5
|
||
|
|
+
|
||
|
|
+ blt X, $x_is_neg
|
||
|
|
+setfpec1
|
||
|
|
+ fdivd $f4, $f5, $f0
|
||
|
|
+
|
||
|
|
+ /* Check to see if Y was mis-converted as signed value. */
|
||
|
|
+ fldd $f1, 8(sp)
|
||
|
|
+ blt Y, $y_is_neg
|
||
|
|
+
|
||
|
|
+ /* Check to see if X fit in the double as an exact value. */
|
||
|
|
+ srl X, 53, AT
|
||
|
|
+ bne AT, $x_big
|
||
|
|
+
|
||
|
|
+ /* If we get here, we're expecting exact results from the division.
|
||
|
|
+ Do nothing else besides convert, compute remainder, clean up. */
|
||
|
|
+ fcvtdl_z $f0, $f4
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ _FTOIT $f4, AT, 16
|
||
|
|
+ mull AT, Y, AT
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_remember_state
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_restore ($f1)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ subl X, AT, RV
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+$x_is_neg:
|
||
|
|
+ /* If we get here, X is so big that bit 63 is set, which made the
|
||
|
|
+ conversion come out negative. Fix it up lest we not even get
|
||
|
|
+ a good estimate. */
|
||
|
|
+ ldih AT, 0x5f80 /* 2**64 as float. */
|
||
|
|
+ fstd $f2, 24(sp)
|
||
|
|
+ fstd $f6, 72(sp)
|
||
|
|
+ cfi_rel_offset ($f2, 24)
|
||
|
|
+ cfi_rel_offset ($f6, 72)
|
||
|
|
+ _ITOFS AT, $f2, 16
|
||
|
|
+ .align 4
|
||
|
|
+ faddd $f4, $f2, $f6
|
||
|
|
+ fdivd $f6, $f5, $f0
|
||
|
|
+
|
||
|
|
+ /* Ok, we've now the divide issued. Continue with other checks. */
|
||
|
|
+# .align 4
|
||
|
|
+ fldd $f1, 8(sp)
|
||
|
|
+ unop
|
||
|
|
+ fldd $f2, 24(sp)
|
||
|
|
+ fldd $f6, 72(sp)
|
||
|
|
+ blt Y, $y_is_neg
|
||
|
|
+ cfi_restore ($f1)
|
||
|
|
+ cfi_restore ($f2)
|
||
|
|
+ cfi_restore ($f6)
|
||
|
|
+ cfi_remember_state /* for y_is_neg */
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$x_big:
|
||
|
|
+ /* If we get here, X is large enough that we don't expect exact
|
||
|
|
+ results, and neither X nor Y got mis-translated for the fp
|
||
|
|
+ division. Our task is to take the fp result, figure out how
|
||
|
|
+ far it's off from the correct result and compute a fixup. */
|
||
|
|
+ stl t0, 32(sp)
|
||
|
|
+ stl t1, 40(sp)
|
||
|
|
+ stl t2, 16(sp)
|
||
|
|
+ stl t3, 24(sp)
|
||
|
|
+ cfi_rel_offset (t0, 32)
|
||
|
|
+ cfi_rel_offset (t1, 40)
|
||
|
|
+ cfi_rel_offset (t2, 16)
|
||
|
|
+ cfi_rel_offset (t3, 24)
|
||
|
|
+
|
||
|
|
+#define Q t0 /* quotient */
|
||
|
|
+#define R RV /* remainder */
|
||
|
|
+#define SY t1 /* scaled Y */
|
||
|
|
+#define S t2 /* scalar */
|
||
|
|
+#define QY t3 /* Q*Y */
|
||
|
|
+
|
||
|
|
+ fcvtdl_z $f0, $f4
|
||
|
|
+ _FTOIT $f4, Q, 8
|
||
|
|
+ mull Q, Y, QY
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ stl t4, 8(sp)
|
||
|
|
+ excb
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ cfi_rel_offset (t4, 8)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+
|
||
|
|
+ subl QY, X, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_high
|
||
|
|
+
|
||
|
|
+$q_high_ret:
|
||
|
|
+ subl X, QY, R
|
||
|
|
+ mov Y, SY
|
||
|
|
+ mov 1, S
|
||
|
|
+ bgt R, $q_low
|
||
|
|
+
|
||
|
|
+$q_low_ret:
|
||
|
|
+ ldl t4, 8(sp)
|
||
|
|
+ ldl t0, 32(sp)
|
||
|
|
+ ldl t1, 40(sp)
|
||
|
|
+ ldl t2, 16(sp)
|
||
|
|
+
|
||
|
|
+ ldl t3, 24(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_remember_state
|
||
|
|
+ cfi_restore (t0)
|
||
|
|
+ cfi_restore (t1)
|
||
|
|
+ cfi_restore (t2)
|
||
|
|
+ cfi_restore (t3)
|
||
|
|
+ cfi_restore (t4)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+ /* The quotient that we computed was too large. We need to reduce
|
||
|
|
+ it by S such that Y*S >= R. Obviously the closer we get to the
|
||
|
|
+ correct value the better, but overshooting high is ok, as we'll
|
||
|
|
+ fix that up later. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_high:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ subl Q, S, Q
|
||
|
|
+ unop
|
||
|
|
+ subl QY, SY, QY
|
||
|
|
+ br $q_high_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ /* The quotient that we computed was too small. Divide Y by the
|
||
|
|
+ current remainder (R) and add that to the existing quotient (Q).
|
||
|
|
+ The expectation, of course, is that R is much smaller than X. */
|
||
|
|
+ /* Begin with a shift-up loop. Compute S such that Y*S >= R. We
|
||
|
|
+ already have a copy of Y in SY and the value 1 in S. */
|
||
|
|
+0:
|
||
|
|
+ addl SY, SY, SY
|
||
|
|
+ addl S, S, S
|
||
|
|
+$q_low:
|
||
|
|
+ cmpult SY, R, AT
|
||
|
|
+ bne AT, 0b
|
||
|
|
+
|
||
|
|
+ /* Shift-down and subtract loop. Each iteration compares our scaled
|
||
|
|
+ Y (SY) with the remainder (R); if SY <= R then X is divisible by
|
||
|
|
+ Y's scalar (S) so add it to the quotient (Q). */
|
||
|
|
+2: addl Q, S, t3
|
||
|
|
+ srl S, 1, S
|
||
|
|
+ cmpule SY, R, AT
|
||
|
|
+ subl R, SY, t4
|
||
|
|
+
|
||
|
|
+ selne AT, t3, Q, Q
|
||
|
|
+ selne AT, t4, R, R
|
||
|
|
+ srl SY, 1, SY
|
||
|
|
+ bne S, 2b
|
||
|
|
+
|
||
|
|
+ br $q_low_ret
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_restore_state
|
||
|
|
+$y_is_neg:
|
||
|
|
+ /* If we get here, Y is so big that bit 63 is set. The results
|
||
|
|
+ from the divide will be completely wrong. Fortunately, the
|
||
|
|
+ quotient must be either 0 or 1, so the remainder must be X
|
||
|
|
+ or X-Y, so just compute it directly. */
|
||
|
|
+ cmpule Y, X, AT
|
||
|
|
+ nop
|
||
|
|
+ wfpcr $f3
|
||
|
|
+ subl X, Y, RV
|
||
|
|
+ fldd $f0, 0(sp)
|
||
|
|
+ fldd $f3, 48(sp)
|
||
|
|
+ fldd $f4, 56(sp)
|
||
|
|
+ fldd $f5, 64(sp)
|
||
|
|
+ seleq AT, X, RV, RV
|
||
|
|
+
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_restore ($f0)
|
||
|
|
+ cfi_restore ($f3)
|
||
|
|
+ cfi_restore ($f4)
|
||
|
|
+ cfi_restore ($f5)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+ .align 4
|
||
|
|
+ cfi_def_cfa_offset (FRAME)
|
||
|
|
+$powerof2:
|
||
|
|
+ subl Y, 1, AT
|
||
|
|
+ beq Y, DIVBYZERO
|
||
|
|
+ and X, AT, RV
|
||
|
|
+ ldi sp, FRAME(sp)
|
||
|
|
+ cfi_def_cfa_offset (0)
|
||
|
|
+ ret $31, (RA), 1
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
+ .size __remlu, .-__remlu
|
||
|
|
+
|
||
|
|
+ DO_DIVBYZERO
|
||
|
|
diff --git a/sysdeps/sw_64/rshift.S b/sysdeps/sw_64/rshift.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..81b3d742
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/rshift.S
|
||
|
|
@@ -0,0 +1,105 @@
|
||
|
|
+ # Sw_64 1621 __mpn_rshift --
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # cnt r19
|
||
|
|
+
|
||
|
|
+ # This code runs at 4.8 cycles/limb on the 1621. With infinite unrolling,
|
||
|
|
+ # it would take 4 cycles/limb. It should be possible to get down to 3
|
||
|
|
+ # cycles/limb since both ldl and stl can be paired with the other used
|
||
|
|
+ # instructions. But there are many restrictions in the 1621 pipeline that
|
||
|
|
+ # makes it hard, if not impossible, to get down to 3 cycles/limb:
|
||
|
|
+
|
||
|
|
+ # 1. ldl has a 3 cycle delay, srl and sll have a 2 cycle delay.
|
||
|
|
+ # 2. Only aligned instruction pairs can be paired.
|
||
|
|
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_rshift
|
||
|
|
+ .ent __mpn_rshift
|
||
|
|
+__mpn_rshift:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ ldl $4,0($17) # load first limb
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ subl $31,$19,$7
|
||
|
|
+ subl $18,1,$18
|
||
|
|
+ and $18,4-1,$20 # number of limbs in first loop
|
||
|
|
+ sll $4,$7,$0 # compute function result
|
||
|
|
+
|
||
|
|
+ beq $20,.L0
|
||
|
|
+ subl $18,$20,$18
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop0:
|
||
|
|
+ ldl $3,0($17)
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ subl $20,1,$20
|
||
|
|
+ srl $4,$19,$5
|
||
|
|
+ sll $3,$7,$6
|
||
|
|
+ bis $3,$3,$4
|
||
|
|
+ bis $5,$6,$8
|
||
|
|
+ stl $8,-8($16)
|
||
|
|
+ bne $20,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: beq $18,.Lend
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop: ldl $3,0($17)
|
||
|
|
+ addl $16,32,$16
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ srl $4,$19,$5
|
||
|
|
+ sll $3,$7,$6
|
||
|
|
+
|
||
|
|
+ ldl $4,8($17)
|
||
|
|
+ srl $3,$19,$1
|
||
|
|
+ bis $5,$6,$8
|
||
|
|
+ stl $8,-32($16)
|
||
|
|
+ sll $4,$7,$2
|
||
|
|
+
|
||
|
|
+ ldl $3,16($17)
|
||
|
|
+ srl $4,$19,$5
|
||
|
|
+ bis $1,$2,$8
|
||
|
|
+ stl $8,-24($16)
|
||
|
|
+ sll $3,$7,$6
|
||
|
|
+
|
||
|
|
+ ldl $4,24($17)
|
||
|
|
+ srl $3,$19,$1
|
||
|
|
+ bis $5,$6,$8
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ sll $4,$7,$2
|
||
|
|
+
|
||
|
|
+ addl $17,32,$17
|
||
|
|
+ bis $1,$2,$8
|
||
|
|
+ stl $8,-8($16)
|
||
|
|
+
|
||
|
|
+ bgt $18,.Loop
|
||
|
|
+
|
||
|
|
+.Lend: srl $4,$19,$8
|
||
|
|
+ stl $8,0($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_rshift
|
||
|
|
diff --git a/sysdeps/sw_64/sub_n.S b/sysdeps/sw_64/sub_n.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..d0d5a30c
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sub_n.S
|
||
|
|
@@ -0,0 +1,118 @@
|
||
|
|
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
|
||
|
|
+ # store difference in a third limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # s2_ptr $18
|
||
|
|
+ # size $19
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_sub_n
|
||
|
|
+ .ent __mpn_sub_n
|
||
|
|
+__mpn_sub_n:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ ldl $3,0($17)
|
||
|
|
+ ldl $4,0($18)
|
||
|
|
+
|
||
|
|
+ subl $19,1,$19
|
||
|
|
+ and $19,4-1,$2 # number of limbs in first loop
|
||
|
|
+ bis $31,$31,$0
|
||
|
|
+ beq $2,.L0 # if multiple of 4 limbs, skip first loop
|
||
|
|
+
|
||
|
|
+ subl $19,$2,$19
|
||
|
|
+
|
||
|
|
+.Loop0: subl $2,1,$2
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $4,$0,$4
|
||
|
|
+ ldl $6,8($18)
|
||
|
|
+ cmpult $4,$0,$1
|
||
|
|
+ subl $3,$4,$4
|
||
|
|
+ cmpult $3,$4,$0
|
||
|
|
+ stl $4,0($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ addl $18,8,$18
|
||
|
|
+ bis $5,$5,$3
|
||
|
|
+ bis $6,$6,$4
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ bne $2,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: beq $19,.Lend
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop: subl $19,4,$19
|
||
|
|
+
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $4,$0,$4
|
||
|
|
+ ldl $6,8($18)
|
||
|
|
+ cmpult $4,$0,$1
|
||
|
|
+ subl $3,$4,$4
|
||
|
|
+ cmpult $3,$4,$0
|
||
|
|
+ stl $4,0($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ ldl $3,16($17)
|
||
|
|
+ addl $6,$0,$6
|
||
|
|
+ ldl $4,16($18)
|
||
|
|
+ cmpult $6,$0,$1
|
||
|
|
+ subl $5,$6,$6
|
||
|
|
+ cmpult $5,$6,$0
|
||
|
|
+ stl $6,8($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ ldl $5,24($17)
|
||
|
|
+ addl $4,$0,$4
|
||
|
|
+ ldl $6,24($18)
|
||
|
|
+ cmpult $4,$0,$1
|
||
|
|
+ subl $3,$4,$4
|
||
|
|
+ cmpult $3,$4,$0
|
||
|
|
+ stl $4,16($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ ldl $3,32($17)
|
||
|
|
+ addl $6,$0,$6
|
||
|
|
+ ldl $4,32($18)
|
||
|
|
+ cmpult $6,$0,$1
|
||
|
|
+ subl $5,$6,$6
|
||
|
|
+ cmpult $5,$6,$0
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+
|
||
|
|
+ addl $17,32,$17
|
||
|
|
+ addl $18,32,$18
|
||
|
|
+ addl $16,32,$16
|
||
|
|
+ bne $19,.Loop
|
||
|
|
+
|
||
|
|
+.Lend: addl $4,$0,$4
|
||
|
|
+ cmpult $4,$0,$1
|
||
|
|
+ subl $3,$4,$4
|
||
|
|
+ cmpult $3,$4,$0
|
||
|
|
+ stl $4,0($16)
|
||
|
|
+ or $0,$1,$0
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ .end __mpn_sub_n
|
||
|
|
diff --git a/sysdeps/sw_64/submul_1.S b/sysdeps/sw_64/submul_1.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..2cad2bef
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/submul_1.S
|
||
|
|
@@ -0,0 +1,89 @@
|
||
|
|
+ # Sw_64 1621 __mpn_submul_1 -- Multiply a limb vector with a limb and
|
||
|
|
+ # subtract the result from a second limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # s2_limb r19
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_submul_1
|
||
|
|
+ .ent __mpn_submul_1 2
|
||
|
|
+__mpn_submul_1:
|
||
|
|
+ .frame $30,0,$26
|
||
|
|
+
|
||
|
|
+ ldl $2,0($17) # $2 = s1_limb
|
||
|
|
+ addl $17,8,$17 # s1_ptr++
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ ldl $5,0($16) # $5 = *res_ptr
|
||
|
|
+ umulh $2,$19,$0 # $0 = prod_high
|
||
|
|
+ beq $18,.Lend1 # jump if size was == 1
|
||
|
|
+ ldl $2,0($17) # $2 = s1_limb
|
||
|
|
+ addl $17,8,$17 # s1_ptr++
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ subl $5,$3,$3
|
||
|
|
+ cmpult $5,$3,$4
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ addl $16,8,$16 # res_ptr++
|
||
|
|
+ beq $18,.Lend2 # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop: mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ ldl $5,0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $18,1,$18 # size--
|
||
|
|
+ umulh $2,$19,$4 # $4 = cy_limb
|
||
|
|
+ ldl $2,0($17) # $2 = s1_limb
|
||
|
|
+ addl $17,8,$17 # s1_ptr++
|
||
|
|
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ subl $5,$3,$3
|
||
|
|
+ cmpult $5,$3,$5
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ addl $16,8,$16 # res_ptr++
|
||
|
|
+ addl $5,$0,$0 # combine carries
|
||
|
|
+ bne $18,.Loop
|
||
|
|
+
|
||
|
|
+.Lend2: mull $2,$19,$3 # $3 = prod_low
|
||
|
|
+ ldl $5,0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4,$0,$0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2,$19,$4 # $4 = cy_limb
|
||
|
|
+ addl $3,$0,$3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ subl $5,$3,$3
|
||
|
|
+ cmpult $5,$3,$5
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ addl $5,$0,$0 # combine carries
|
||
|
|
+ addl $4,$0,$0 # cy_limb = prod_high + cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+.Lend1: subl $5,$3,$3
|
||
|
|
+ cmpult $5,$3,$5
|
||
|
|
+ stl $3,0($16)
|
||
|
|
+ addl $0,$5,$0
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ .end __mpn_submul_1
|
||
|
|
diff --git a/sysdeps/sw_64/sw6a/add_n.S b/sysdeps/sw_64/sw6a/add_n.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..86e9f9ae
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6a/add_n.S
|
||
|
|
@@ -0,0 +1,146 @@
|
||
|
|
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
|
||
|
|
+ # store sum in a third limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # s2_ptr $18
|
||
|
|
+ # size $19
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_add_n
|
||
|
|
+ .ent __mpn_add_n
|
||
|
|
+__mpn_add_n:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ or $31,$31,$25 # clear cy
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
|
||
|
|
+ # Start software pipeline for 1st loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ addl $0,$4,$20 # 1st main add
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $20,$0,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ addl $5,$28,$21 # 2nd main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
|
||
|
|
+ # 1st loop handles groups of 4 limbs in a software pipeline
|
||
|
|
+ .align 4
|
||
|
|
+.Loop: cmpult $21,$28,$25 # compute cy from last add
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ addl $28,$6,$22 # 3rd main add
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $22,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ addl $28,$7,$23 # 4th main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $23,$28,$25 # compute cy from last add
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ addl $4,$28,$20 # 1st main add
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+ addl $5,$28,$21 # 2nd main add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ bge $19,.Loop
|
||
|
|
+ # Finish software pipeline for 1st loop
|
||
|
|
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ addl $28,$6,$22 # 3rd main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $22,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ addl $28,$7,$23 # 4th main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $23,$28,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+.Lend2: addl $19,4,$19 # restore loop cnt
|
||
|
|
+ beq $19,.Lret
|
||
|
|
+ # Start software pipeline for 2nd loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $19,1,$19
|
||
|
|
+ beq $19,.Lend0
|
||
|
|
+ # 2nd loop handles remaining 1-3 limbs
|
||
|
|
+ .align 4
|
||
|
|
+.Loop0: addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $0,8($18)
|
||
|
|
+ addl $4,$28,$20 # main add
|
||
|
|
+ ldl $4,8($17)
|
||
|
|
+ addl $18,8,$18
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ subl $19,1,$19 # decr loop cnt
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ bne $19,.Loop0
|
||
|
|
+.Lend0: addl $0,$25,$28 # cy add
|
||
|
|
+ addl $4,$28,$20 # main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two adds
|
||
|
|
+
|
||
|
|
+.Lret: or $25,$31,$0 # return cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_add_n
|
||
|
|
diff --git a/sysdeps/sw_64/sw6a/addmul_1.S b/sysdeps/sw_64/sw6a/addmul_1.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..287e8573
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6a/addmul_1.S
|
||
|
|
@@ -0,0 +1,475 @@
|
||
|
|
+ # Sw_64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
|
||
|
|
+ # the result to a second limb vector.
|
||
|
|
+ #
|
||
|
|
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
||
|
|
+ #
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+ #
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published
|
||
|
|
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
|
||
|
|
+ # your option) any later version.
|
||
|
|
+ #
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+ #
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # size $18
|
||
|
|
+ # s2_limb $19
|
||
|
|
+ #
|
||
|
|
+ #
|
||
|
|
+ # This code was written in close cooperation with pipeline expert
|
||
|
|
+ # . Any errors are tege's fault, though.
|
||
|
|
+ #
|
||
|
|
+ # Register usages for unrolled loop:
|
||
|
|
+ # 0-3 mul's
|
||
|
|
+ # 4-7 acc's
|
||
|
|
+ # 8-15 mul results
|
||
|
|
+ # 20,21 carry's
|
||
|
|
+ # 22,23 save for stores
|
||
|
|
+ #
|
||
|
|
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
|
||
|
|
+ #
|
||
|
|
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
|
||
|
|
+ # them, so that further disturbance to the schedule is damped.
|
||
|
|
+ #
|
||
|
|
+ # We couldn't pair the loads, because the entangled schedule of the
|
||
|
|
+ # carry's has to happen on one side {0} of the machine. Note, the total
|
||
|
|
+ # use of U0, and the total use of L0 (after attending to the stores).
|
||
|
|
+ # which is part of the reason why....
|
||
|
|
+ #
|
||
|
|
+ # This is a great schedule for the d_cache, a poor schedule for the
|
||
|
|
+ # b_cache. The lockup on U0 means that any stall can't be recovered
|
||
|
|
+ # from. Consider a ldl in L1. say that load gets stalled because it
|
||
|
|
+ # collides with a fill from the b_Cache. On the next cycle, this load
|
||
|
|
+ # gets priority. If first looks at L0, and goes there. The instruction
|
||
|
|
+ # we intended for L0 gets to look at L1, which is NOT where we want
|
||
|
|
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
|
||
|
|
+ # causes a further instruction to stall.
|
||
|
|
+ #
|
||
|
|
+ # So for b_cache, we're likely going to want to put one or more cycles
|
||
|
|
+ # back into the code! And, of course, put in prefetches. For the
|
||
|
|
+ # accumulator, flds, intent to modify. For the fmuldiplier, you might
|
||
|
|
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
|
||
|
|
+ # 256 ahead of present pointer value. At a place where we have an mt
|
||
|
|
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
|
||
|
|
+ # prefetch into lower.
|
||
|
|
+ #
|
||
|
|
+ # Note, the usage of physical registers per cycle is smoothed off, as
|
||
|
|
+ # much as possible.
|
||
|
|
+ #
|
||
|
|
+ # Note, the ldl's and stl's are at the end of the quadpacks. note, we'd
|
||
|
|
+ # like not to have a ldl or stl to preceded a conditional branch in a
|
||
|
|
+ # quadpack. The conditional branch moves the retire pointer one cycle
|
||
|
|
+ # later.
|
||
|
|
+ #
|
||
|
|
+ # Optimization notes:
|
||
|
|
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
|
||
|
|
+ # Reserved regs: $29 $30 $31
|
||
|
|
+ # Free caller-saves regs in unrolled code: $24 $25 $28
|
||
|
|
+ # We should swap some of the callee-saves regs for some of the free
|
||
|
|
+ # caller-saves regs, saving some overhead cycles.
|
||
|
|
+ # Most importantly, we should write fast code for the 0-7 case.
|
||
|
|
+ # The code we use there are for the 21164, and runs at 7 cycles/limb
|
||
|
|
+ # on the 21264. Should not be hard, if we write specialized code for
|
||
|
|
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
|
||
|
|
+ # need a jump table indexed by the low 3 bits of the count argument.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+ .text
|
||
|
|
+
|
||
|
|
+ .globl __mpn_addmul_1
|
||
|
|
+ .ent __mpn_addmul_1
|
||
|
|
+__mpn_addmul_1:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+ .prologue 0
|
||
|
|
+
|
||
|
|
+ cmpult $18, 8, $1
|
||
|
|
+ beq $1, $Large
|
||
|
|
+
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ umulh $2, $19, $0 # $0 = prod_high
|
||
|
|
+ beq $18, $Lend0b # jump if size was == 1
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $4
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ beq $18, $Lend0a # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+$Loop0: mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ bne $18, $Loop0
|
||
|
|
+$Lend0a:
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ addl $4, $0, $0 # cy_limb = prod_high + cy
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+$Lend0b:
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $0, $5, $0
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+
|
||
|
|
+$Large:
|
||
|
|
+ ldi $30, -240($30)
|
||
|
|
+ stl $9, 8($30)
|
||
|
|
+ stl $10, 16($30)
|
||
|
|
+ stl $11, 24($30)
|
||
|
|
+ stl $12, 32($30)
|
||
|
|
+ stl $13, 40($30)
|
||
|
|
+ stl $14, 48($30)
|
||
|
|
+ stl $15, 56($30)
|
||
|
|
+
|
||
|
|
+ and $18, 7, $20 # count for the first loop, 0-7
|
||
|
|
+ srl $18, 3, $18 # count for unrolled loop
|
||
|
|
+ bis $31, $31, $0
|
||
|
|
+ beq $20, $Lunroll
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ umulh $2, $19, $0 # $0 = prod_high
|
||
|
|
+ beq $20, $Lend1b # jump if size was == 1
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $4
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ beq $20, $Lend1a # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+$Loop1: mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ bne $20, $Loop1
|
||
|
|
+
|
||
|
|
+$Lend1a:
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ addl $4, $0, $0 # cy_limb = prod_high + cy
|
||
|
|
+ br $31, $Lunroll
|
||
|
|
+$Lend1b:
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $0, $5, $0
|
||
|
|
+
|
||
|
|
+$Lunroll:
|
||
|
|
+ ldi $17, -16($17) # L1 bookkeeping
|
||
|
|
+ ldi $16, -16($16) # L1 bookkeeping
|
||
|
|
+ bis $0, $31, $12
|
||
|
|
+
|
||
|
|
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
|
||
|
|
+
|
||
|
|
+ ldl $2, 16($17) # L1
|
||
|
|
+ ldl $3, 24($17) # L1
|
||
|
|
+ ldi $18, -1($18) # L1 bookkeeping
|
||
|
|
+ ldl $6, 16($16) # L1
|
||
|
|
+ ldl $7, 24($16) # L1
|
||
|
|
+ ldl $0, 32($17) # L1
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ ldl $1, 40($17) # L1
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ ldi $17, 64($17) # L1 bookkeeping
|
||
|
|
+ ldl $4, 32($16) # L1
|
||
|
|
+ ldl $5, 40($16) # L1
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ ldl $2, -16($17) # L1
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ ldl $3, -8($17) # L1
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ ldi $16, 64($16) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, -16($16) # L1
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, -8($16) # L1
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 0($17) # L1
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 8($17) # L1
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # L0 lo + acc
|
||
|
|
+ stl $22, -48($16) # L0
|
||
|
|
+ stl $23, -40($16) # L1
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ addl $8, $21, $8 # U0 hi mul + carry
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ ble $18, $Lend # U1 bookkeeping
|
||
|
|
+
|
||
|
|
+ # ____ MAIN UNROLLED LOOP ____
|
||
|
|
+ .align 4
|
||
|
|
+$Loop:
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 0($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 8($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ ldl $2, 16($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ ldl $3, 24($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, -32($16) # L0
|
||
|
|
+ stl $23, -24($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $18, -1($18) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, 16($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, 24($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 32($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 40($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # U0 lo + acc
|
||
|
|
+ stl $22, -16($16) # L0
|
||
|
|
+ stl $23, -8($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $8, $21, $8 # L0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $17, 64($17) # L1 bookkeeping
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 32($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 40($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ ldl $2, -16($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ ldl $3, -8($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, 0($16) # L0
|
||
|
|
+ stl $23, 8($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $16, 64($16) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, -16($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, -8($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 0($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 8($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # L0 lo + acc
|
||
|
|
+ stl $22, -48($16) # L0
|
||
|
|
+ stl $23, -40($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $8, $21, $8 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ bis $31, $31, $31 # L1 mt
|
||
|
|
+ bgt $18, $Loop # U1 bookkeeping
|
||
|
|
+
|
||
|
|
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
|
||
|
|
+$Lend:
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 0($16) # L1
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 8($16) # L1
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, -32($16) # L0
|
||
|
|
+ stl $23, -24($16) # L1
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ addl $4, $9, $4 # U0 lo + acc
|
||
|
|
+ stl $22, -16($16) # L0
|
||
|
|
+ stl $23, -8($16) # L1
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ addl $8, $21, $8 # L0 hi mul + carry
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ stl $22, 0($16) # L0
|
||
|
|
+ stl $23, 8($16) # L1
|
||
|
|
+ addl $12, $21, $0 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ ldl $9, 8($30)
|
||
|
|
+ ldl $10, 16($30)
|
||
|
|
+ ldl $11, 24($30)
|
||
|
|
+ ldl $12, 32($30)
|
||
|
|
+ ldl $13, 40($30)
|
||
|
|
+ ldl $14, 48($30)
|
||
|
|
+ ldl $15, 56($30)
|
||
|
|
+ ldi $30, 240($30)
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+
|
||
|
|
+ .end __mpn_addmul_1
|
||
|
|
diff --git a/sysdeps/sw_64/sw6a/lshift.S b/sysdeps/sw_64/sw6a/lshift.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..cc00593c
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6a/lshift.S
|
||
|
|
@@ -0,0 +1,172 @@
|
||
|
|
+ # Sw_64 __mpn_lshift --
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # cnt r19
|
||
|
|
+
|
||
|
|
+ # This code runs at 3.25 cycles/limb on the sw_64.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_lshift
|
||
|
|
+ .ent __mpn_lshift
|
||
|
|
+__mpn_lshift:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ s8addl $18,$17,$17 # make r17 point at end of s1
|
||
|
|
+ ldl $4,-8($17) # load first limb
|
||
|
|
+ subl $31,$19,$20
|
||
|
|
+ s8addl $18,$16,$16 # make r16 point at end of RES
|
||
|
|
+ subl $18,1,$18
|
||
|
|
+ and $18,4-1,$28 # number of limbs in first loop
|
||
|
|
+ srl $4,$20,$0 # compute function result
|
||
|
|
+
|
||
|
|
+ beq $28,.L0
|
||
|
|
+ subl $18,$28,$18
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop0: ldl $3,-16($17)
|
||
|
|
+ subl $16,8,$16
|
||
|
|
+ sll $4,$19,$5
|
||
|
|
+ subl $17,8,$17
|
||
|
|
+ subl $28,1,$28
|
||
|
|
+ srl $3,$20,$6
|
||
|
|
+ or $3,$3,$4
|
||
|
|
+ or $5,$6,$8
|
||
|
|
+ stl $8,0($16)
|
||
|
|
+ bne $28,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: sll $4,$19,$24
|
||
|
|
+ beq $18,.Lend
|
||
|
|
+ # warm up phase 1
|
||
|
|
+ ldl $1,-16($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ ldl $2,-24($17)
|
||
|
|
+ ldl $3,-32($17)
|
||
|
|
+ ldl $4,-40($17)
|
||
|
|
+ beq $18,.Lend1
|
||
|
|
+ # warm up phase 2
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ ldl $1,-48($17)
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ ldl $2,-56($17)
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ ldl $3,-64($17)
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ ldl $4,-72($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ beq $18,.Lend2
|
||
|
|
+ .align 4
|
||
|
|
+ # main loop
|
||
|
|
+.Loop: stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ ldl $1,-80($17)
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ ldl $2,-88($17)
|
||
|
|
+
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ subl $16,32,$16
|
||
|
|
+
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ ldl $3,-96($17)
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ ldl $4,-104($17)
|
||
|
|
+
|
||
|
|
+ subl $17,32,$17
|
||
|
|
+ bne $18,.Loop
|
||
|
|
+ # cool down phase 2/1
|
||
|
|
+.Lend2: stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ # cool down phase 2/2
|
||
|
|
+ stl $7,-40($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-48($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,-56($16)
|
||
|
|
+ stl $6,-64($16)
|
||
|
|
+ # cool down phase 2/3
|
||
|
|
+ stl $24,-72($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ # cool down phase 1/1
|
||
|
|
+.Lend1: srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ # cool down phase 1/2
|
||
|
|
+ stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ stl $24,-40($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+.Lend: stl $24,-8($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_lshift
|
||
|
|
diff --git a/sysdeps/sw_64/sw6a/rshift.S b/sysdeps/sw_64/sw6a/rshift.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..416c3903
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6a/rshift.S
|
||
|
|
@@ -0,0 +1,170 @@
|
||
|
|
+ # Sw_64 __mpn_rshift --
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # cnt r19
|
||
|
|
+
|
||
|
|
+ # This code runs at 3.25 cycles/limb on the sw_64.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_rshift
|
||
|
|
+ .ent __mpn_rshift
|
||
|
|
+__mpn_rshift:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ ldl $4,0($17) # load first limb
|
||
|
|
+ subl $31,$19,$20
|
||
|
|
+ subl $18,1,$18
|
||
|
|
+ and $18,4-1,$28 # number of limbs in first loop
|
||
|
|
+ sll $4,$20,$0 # compute function result
|
||
|
|
+
|
||
|
|
+ beq $28,.L0
|
||
|
|
+ subl $18,$28,$18
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop0: ldl $3,8($17)
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ srl $4,$19,$5
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ subl $28,1,$28
|
||
|
|
+ sll $3,$20,$6
|
||
|
|
+ or $3,$3,$4
|
||
|
|
+ or $5,$6,$8
|
||
|
|
+ stl $8,-8($16)
|
||
|
|
+ bne $28,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: srl $4,$19,$24
|
||
|
|
+ beq $18,.Lend
|
||
|
|
+ # warm up phase 1
|
||
|
|
+ ldl $1,8($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ ldl $2,16($17)
|
||
|
|
+ ldl $3,24($17)
|
||
|
|
+ ldl $4,32($17)
|
||
|
|
+ beq $18,.Lend1
|
||
|
|
+ # warm up phase 2
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ ldl $1,40($17)
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ ldl $2,48($17)
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ ldl $3,56($17)
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ ldl $4,64($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ beq $18,.Lend2
|
||
|
|
+ .align 4
|
||
|
|
+ # main loop
|
||
|
|
+.Loop: stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ ldl $1,72($17)
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ ldl $2,80($17)
|
||
|
|
+
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ addl $16,32,$16
|
||
|
|
+
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ ldl $3,88($17)
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ ldl $4,96($17)
|
||
|
|
+
|
||
|
|
+ addl $17,32,$17
|
||
|
|
+ bne $18,.Loop
|
||
|
|
+ # cool down phase 2/1
|
||
|
|
+.Lend2: stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ # cool down phase 2/2
|
||
|
|
+ stl $7,32($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,40($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,48($16)
|
||
|
|
+ stl $6,56($16)
|
||
|
|
+ # cool down phase 2/3
|
||
|
|
+ stl $24,64($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ # cool down phase 1/1
|
||
|
|
+.Lend1: sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ # cool down phase 1/2
|
||
|
|
+ stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ stl $24,32($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+.Lend: stl $24,0($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_rshift
|
||
|
|
diff --git a/sysdeps/sw_64/sw6a/sub_n.S b/sysdeps/sw_64/sw6a/sub_n.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..95c257f7
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6a/sub_n.S
|
||
|
|
@@ -0,0 +1,147 @@
|
||
|
|
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
|
||
|
|
+ # store difference in a third limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # s2_ptr $18
|
||
|
|
+ # size $19
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_sub_n
|
||
|
|
+ .ent __mpn_sub_n
|
||
|
|
+__mpn_sub_n:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ or $31,$31,$25 # clear cy
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
|
||
|
|
+ # Start software pipeline for 1st loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ subl $4,$0,$20 # 1st main sub
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last sub
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ subl $5,$28,$21 # 2nd main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
|
||
|
|
+ # 1st loop handles groups of 4 limbs in a software pipeline
|
||
|
|
+ .align 4
|
||
|
|
+.Loop: cmpult $5,$21,$25 # compute cy from last add
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $6,$28,$22 # 3rd main sub
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $6,$22,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ subl $7,$28,$23 # 4th main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $7,$23,$25 # compute cy from last add
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ subl $4,$28,$20 # 1st main sub
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+ subl $5,$28,$21 # 2nd main sub
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ bge $19,.Loop
|
||
|
|
+ # Finish software pipeline for 1st loop
|
||
|
|
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ subl $6,$28,$22 # 3rd main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $6,$22,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ subl $7,$28,$23 # 4th main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $7,$23,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+.Lend2: addl $19,4,$19 # restore loop cnt
|
||
|
|
+ beq $19,.Lret
|
||
|
|
+ # Start software pipeline for 2nd loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $19,1,$19
|
||
|
|
+ beq $19,.Lend0
|
||
|
|
+ # 2nd loop handles remaining 1-3 limbs
|
||
|
|
+ .align 4
|
||
|
|
+.Loop0: addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $0,8($18)
|
||
|
|
+ subl $4,$28,$20 # main sub
|
||
|
|
+ ldl $1,8($17)
|
||
|
|
+ addl $18,8,$18
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ subl $19,1,$19 # decr loop cnt
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ or $1,$31,$4
|
||
|
|
+ bne $19,.Loop0
|
||
|
|
+.Lend0: addl $0,$25,$28 # cy add
|
||
|
|
+ subl $4,$28,$20 # main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+
|
||
|
|
+.Lret: or $25,$31,$0 # return cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_sub_n
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/add_n.S b/sysdeps/sw_64/sw6b/add_n.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..86e9f9ae
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/add_n.S
|
||
|
|
@@ -0,0 +1,146 @@
|
||
|
|
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
|
||
|
|
+ # store sum in a third limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # s2_ptr $18
|
||
|
|
+ # size $19
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_add_n
|
||
|
|
+ .ent __mpn_add_n
|
||
|
|
+__mpn_add_n:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ or $31,$31,$25 # clear cy
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
|
||
|
|
+ # Start software pipeline for 1st loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ addl $0,$4,$20 # 1st main add
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $20,$0,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ addl $5,$28,$21 # 2nd main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
|
||
|
|
+ # 1st loop handles groups of 4 limbs in a software pipeline
|
||
|
|
+ .align 4
|
||
|
|
+.Loop: cmpult $21,$28,$25 # compute cy from last add
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ addl $28,$6,$22 # 3rd main add
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $22,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ addl $28,$7,$23 # 4th main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $23,$28,$25 # compute cy from last add
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ addl $4,$28,$20 # 1st main add
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+ addl $5,$28,$21 # 2nd main add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ bge $19,.Loop
|
||
|
|
+ # Finish software pipeline for 1st loop
|
||
|
|
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ addl $28,$6,$22 # 3rd main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $22,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ addl $28,$7,$23 # 4th main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $23,$28,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+.Lend2: addl $19,4,$19 # restore loop cnt
|
||
|
|
+ beq $19,.Lret
|
||
|
|
+ # Start software pipeline for 2nd loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $19,1,$19
|
||
|
|
+ beq $19,.Lend0
|
||
|
|
+ # 2nd loop handles remaining 1-3 limbs
|
||
|
|
+ .align 4
|
||
|
|
+.Loop0: addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $0,8($18)
|
||
|
|
+ addl $4,$28,$20 # main add
|
||
|
|
+ ldl $4,8($17)
|
||
|
|
+ addl $18,8,$18
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ subl $19,1,$19 # decr loop cnt
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ bne $19,.Loop0
|
||
|
|
+.Lend0: addl $0,$25,$28 # cy add
|
||
|
|
+ addl $4,$28,$20 # main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+
|
||
|
|
+.Lret: or $25,$31,$0 # return cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_add_n
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/addmul_1.S b/sysdeps/sw_64/sw6b/addmul_1.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..a288f040
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/addmul_1.S
|
||
|
|
@@ -0,0 +1,475 @@
|
||
|
|
+ # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
|
||
|
|
+ # the result to a second limb vector.
|
||
|
|
+ #
|
||
|
|
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
||
|
|
+ #
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+ #
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published
|
||
|
|
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
|
||
|
|
+ # your option) any later version.
|
||
|
|
+ #
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+ #
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # size $18
|
||
|
|
+ # s2_limb $19
|
||
|
|
+ #
|
||
|
|
+ #
|
||
|
|
+ # This code was written in close cooperation with pipeline expert
|
||
|
|
+ # . Any errors are tege's fault, though.
|
||
|
|
+ #
|
||
|
|
+ # Register usages for unrolled loop:
|
||
|
|
+ # 0-3 mul's
|
||
|
|
+ # 4-7 acc's
|
||
|
|
+ # 8-15 mul results
|
||
|
|
+ # 20,21 carry's
|
||
|
|
+ # 22,23 save for stores
|
||
|
|
+ #
|
||
|
|
+ # Sustains 8 mul-fadds in 29 cycles in the unrolled inner loop.
|
||
|
|
+ #
|
||
|
|
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
|
||
|
|
+ # them, so that further disturbance to the schedule is damped.
|
||
|
|
+ #
|
||
|
|
+ # We couldn't pair the loads, because the entangled schedule of the
|
||
|
|
+ # carry's has to happen on one side {0} of the machine. Note, the total
|
||
|
|
+ # use of U0, and the total use of L0 (after attending to the stores).
|
||
|
|
+ # which is part of the reason why....
|
||
|
|
+ #
|
||
|
|
+ # This is a great schedule for the d_cache, a poor schedule for the
|
||
|
|
+ # b_cache. The lockup on U0 means that any stall can't be recovered
|
||
|
|
+ # from. Consider a ldl in L1. say that load gets stalled because it
|
||
|
|
+ # collides with a fill from the b_Cache. On the next cycle, this load
|
||
|
|
+ # gets priority. If first looks at L0, and goes there. The instruction
|
||
|
|
+ # we intended for L0 gets to look at L1, which is NOT where we want
|
||
|
|
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
|
||
|
|
+ # causes a further instruction to stall.
|
||
|
|
+ #
|
||
|
|
+ # So for b_cache, we're likely going to want to put one or more cycles
|
||
|
|
+ # back into the code! And, of course, put in prefetches. For the
|
||
|
|
+ # accumulator, flds, intent to modify. For the fmuldiplier, you might
|
||
|
|
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
|
||
|
|
+ # 256 ahead of present pointer value. At a place where we have an mt
|
||
|
|
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
|
||
|
|
+ # prefetch into lower.
|
||
|
|
+ #
|
||
|
|
+ # Note, the usage of physical registers per cycle is smoothed off, as
|
||
|
|
+ # much as possible.
|
||
|
|
+ #
|
||
|
|
+ # Note, the ldl's and stl's are at the end of the quadpacks. note, we'd
|
||
|
|
+ # like not to have a ldl or stl to preceded a conditional branch in a
|
||
|
|
+ # quadpack. The conditional branch moves the retire pointer one cycle
|
||
|
|
+ # later.
|
||
|
|
+ #
|
||
|
|
+ # Optimization notes:
|
||
|
|
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
|
||
|
|
+ # Reserved regs: $29 $30 $31
|
||
|
|
+ # Free caller-saves regs in unrolled code: $24 $25 $28
|
||
|
|
+ # We should swap some of the callee-saves regs for some of the free
|
||
|
|
+ # caller-saves regs, saving some overhead cycles.
|
||
|
|
+ # Most importantly, we should write fast code for the 0-7 case.
|
||
|
|
+ # The code we use there are for the 21164, and runs at 7 cycles/limb
|
||
|
|
+ # on the 21264. Should not be hard, if we write specialized code for
|
||
|
|
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
|
||
|
|
+ # need a jump table indexed by the low 3 bits of the count argument.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+ .text
|
||
|
|
+
|
||
|
|
+ .globl __mpn_addmul_1
|
||
|
|
+ .ent __mpn_addmul_1
|
||
|
|
+__mpn_addmul_1:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+ .prologue 0
|
||
|
|
+
|
||
|
|
+ cmpult $18, 8, $1
|
||
|
|
+ beq $1, $Large
|
||
|
|
+
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ umulh $2, $19, $0 # $0 = prod_high
|
||
|
|
+ beq $18, $Lend0b # jump if size was == 1
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $4
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ beq $18, $Lend0a # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+$Loop0: mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ bne $18, $Loop0
|
||
|
|
+$Lend0a:
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ addl $4, $0, $0 # cy_limb = prod_high + cy
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+$Lend0b:
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $0, $5, $0
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+
|
||
|
|
+$Large:
|
||
|
|
+ ldi $30, -240($30)
|
||
|
|
+ stl $9, 8($30)
|
||
|
|
+ stl $10, 16($30)
|
||
|
|
+ stl $11, 24($30)
|
||
|
|
+ stl $12, 32($30)
|
||
|
|
+ stl $13, 40($30)
|
||
|
|
+ stl $14, 48($30)
|
||
|
|
+ stl $15, 56($30)
|
||
|
|
+
|
||
|
|
+ and $18, 7, $20 # count for the first loop, 0-7
|
||
|
|
+ srl $18, 3, $18 # count for unrolled loop
|
||
|
|
+ bis $31, $31, $0
|
||
|
|
+ beq $20, $Lunroll
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ umulh $2, $19, $0 # $0 = prod_high
|
||
|
|
+ beq $20, $Lend1b # jump if size was == 1
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $4
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ beq $20, $Lend1a # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+$Loop1: mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ bne $20, $Loop1
|
||
|
|
+
|
||
|
|
+$Lend1a:
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ addl $4, $0, $0 # cy_limb = prod_high + cy
|
||
|
|
+ br $31, $Lunroll
|
||
|
|
+$Lend1b:
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $0, $5, $0
|
||
|
|
+
|
||
|
|
+$Lunroll:
|
||
|
|
+ ldi $17, -16($17) # L1 bookkeeping
|
||
|
|
+ ldi $16, -16($16) # L1 bookkeeping
|
||
|
|
+ bis $0, $31, $12
|
||
|
|
+
|
||
|
|
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
|
||
|
|
+
|
||
|
|
+ ldl $2, 16($17) # L1
|
||
|
|
+ ldl $3, 24($17) # L1
|
||
|
|
+ ldi $18, -1($18) # L1 bookkeeping
|
||
|
|
+ ldl $6, 16($16) # L1
|
||
|
|
+ ldl $7, 24($16) # L1
|
||
|
|
+ ldl $0, 32($17) # L1
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ ldl $1, 40($17) # L1
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ ldi $17, 64($17) # L1 bookkeeping
|
||
|
|
+ ldl $4, 32($16) # L1
|
||
|
|
+ ldl $5, 40($16) # L1
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ ldl $2, -16($17) # L1
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ ldl $3, -8($17) # L1
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ ldi $16, 64($16) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, -16($16) # L1
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, -8($16) # L1
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 0($17) # L1
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 8($17) # L1
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # L0 lo + acc
|
||
|
|
+ stl $22, -48($16) # L0
|
||
|
|
+ stl $23, -40($16) # L1
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ addl $8, $21, $8 # U0 hi mul + carry
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ ble $18, $Lend # U1 bookkeeping
|
||
|
|
+
|
||
|
|
+ # ____ MAIN UNROLLED LOOP ____
|
||
|
|
+ .align 4
|
||
|
|
+$Loop:
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 0($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 8($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ ldl $2, 16($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ ldl $3, 24($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, -32($16) # L0
|
||
|
|
+ stl $23, -24($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $18, -1($18) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, 16($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, 24($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 32($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 40($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # U0 lo + acc
|
||
|
|
+ stl $22, -16($16) # L0
|
||
|
|
+ stl $23, -8($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $8, $21, $8 # L0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $17, 64($17) # L1 bookkeeping
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 32($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 40($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ ldl $2, -16($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ ldl $3, -8($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, 0($16) # L0
|
||
|
|
+ stl $23, 8($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $16, 64($16) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, -16($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, -8($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 0($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 8($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # L0 lo + acc
|
||
|
|
+ stl $22, -48($16) # L0
|
||
|
|
+ stl $23, -40($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $8, $21, $8 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ bis $31, $31, $31 # L1 mt
|
||
|
|
+ bgt $18, $Loop # U1 bookkeeping
|
||
|
|
+
|
||
|
|
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
|
||
|
|
+$Lend:
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 0($16) # L1
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 8($16) # L1
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, -32($16) # L0
|
||
|
|
+ stl $23, -24($16) # L1
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ addl $4, $9, $4 # U0 lo + acc
|
||
|
|
+ stl $22, -16($16) # L0
|
||
|
|
+ stl $23, -8($16) # L1
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ addl $8, $21, $8 # L0 hi mul + carry
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ stl $22, 0($16) # L0
|
||
|
|
+ stl $23, 8($16) # L1
|
||
|
|
+ addl $12, $21, $0 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ ldl $9, 8($30)
|
||
|
|
+ ldl $10, 16($30)
|
||
|
|
+ ldl $11, 24($30)
|
||
|
|
+ ldl $12, 32($30)
|
||
|
|
+ ldl $13, 40($30)
|
||
|
|
+ ldl $14, 48($30)
|
||
|
|
+ ldl $15, 56($30)
|
||
|
|
+ ldi $30, 240($30)
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+
|
||
|
|
+ .end __mpn_addmul_1
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/lshift.S b/sysdeps/sw_64/sw6b/lshift.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..cc00593c
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/lshift.S
|
||
|
|
@@ -0,0 +1,172 @@
|
||
|
|
+ # Sw_64 __mpn_lshift --
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # cnt r19
|
||
|
|
+
|
||
|
|
+ # This code runs at 3.25 cycles/limb on the sw_64.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_lshift
|
||
|
|
+ .ent __mpn_lshift
|
||
|
|
+__mpn_lshift:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ s8addl $18,$17,$17 # make r17 point at end of s1
|
||
|
|
+ ldl $4,-8($17) # load first limb
|
||
|
|
+ subl $31,$19,$20
|
||
|
|
+ s8addl $18,$16,$16 # make r16 point at end of RES
|
||
|
|
+ subl $18,1,$18
|
||
|
|
+ and $18,4-1,$28 # number of limbs in first loop
|
||
|
|
+ srl $4,$20,$0 # compute function result
|
||
|
|
+
|
||
|
|
+ beq $28,.L0
|
||
|
|
+ subl $18,$28,$18
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop0: ldl $3,-16($17)
|
||
|
|
+ subl $16,8,$16
|
||
|
|
+ sll $4,$19,$5
|
||
|
|
+ subl $17,8,$17
|
||
|
|
+ subl $28,1,$28
|
||
|
|
+ srl $3,$20,$6
|
||
|
|
+ or $3,$3,$4
|
||
|
|
+ or $5,$6,$8
|
||
|
|
+ stl $8,0($16)
|
||
|
|
+ bne $28,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: sll $4,$19,$24
|
||
|
|
+ beq $18,.Lend
|
||
|
|
+ # warm up phase 1
|
||
|
|
+ ldl $1,-16($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ ldl $2,-24($17)
|
||
|
|
+ ldl $3,-32($17)
|
||
|
|
+ ldl $4,-40($17)
|
||
|
|
+ beq $18,.Lend1
|
||
|
|
+ # warm up phase 2
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ ldl $1,-48($17)
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ ldl $2,-56($17)
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ ldl $3,-64($17)
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ ldl $4,-72($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ beq $18,.Lend2
|
||
|
|
+ .align 4
|
||
|
|
+ # main loop
|
||
|
|
+.Loop: stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ ldl $1,-80($17)
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ ldl $2,-88($17)
|
||
|
|
+
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ subl $16,32,$16
|
||
|
|
+
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ ldl $3,-96($17)
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ ldl $4,-104($17)
|
||
|
|
+
|
||
|
|
+ subl $17,32,$17
|
||
|
|
+ bne $18,.Loop
|
||
|
|
+ # cool down phase 2/1
|
||
|
|
+.Lend2: stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ # cool down phase 2/2
|
||
|
|
+ stl $7,-40($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-48($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,-56($16)
|
||
|
|
+ stl $6,-64($16)
|
||
|
|
+ # cool down phase 2/3
|
||
|
|
+ stl $24,-72($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ # cool down phase 1/1
|
||
|
|
+.Lend1: srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ # cool down phase 1/2
|
||
|
|
+ stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ stl $24,-40($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+.Lend: stl $24,-8($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_lshift
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/memcpy.S b/sysdeps/sw_64/sw6b/memcpy.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..938ebdfc
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/memcpy.S
|
||
|
|
@@ -0,0 +1,416 @@
|
||
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+ sw6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+/*
|
||
|
|
+ * Much of the information about 21264 scheduling/coding comes from:
|
||
|
|
+ * Compiler Writer's Guide for the Sw_64 21264
|
||
|
|
+ * abbreviated as 'CWG' in other comments here
|
||
|
|
+ * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
|
||
|
|
+ * Scheduling notation:
|
||
|
|
+ * E - either cluster
|
||
|
|
+ * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
|
||
|
|
+ * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
|
||
|
|
+ *
|
||
|
|
+ * Temp usage notes:
|
||
|
|
+ * $0 - destination address
|
||
|
|
+ * $1,$2, - scratch
|
||
|
|
+ */
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+
|
||
|
|
+ .arch ev6
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+
|
||
|
|
+ .type $jmppointh,@object
|
||
|
|
+$jumppointh:
|
||
|
|
+ .gprel32 $both_0mod8
|
||
|
|
+ .gprel32 J$H01
|
||
|
|
+ .gprel32 J$H02
|
||
|
|
+ .gprel32 J$H03
|
||
|
|
+ .gprel32 J$H04
|
||
|
|
+ .gprel32 J$H05
|
||
|
|
+ .gprel32 J$H06
|
||
|
|
+ .gprel32 J$H07
|
||
|
|
+
|
||
|
|
+ENTRY(memcpy)
|
||
|
|
+ .prologue 1
|
||
|
|
+ ldgp $29, 0($27)
|
||
|
|
+ mov $16, $0 # E : copy dest to return
|
||
|
|
+ ble $18, $nomoredata # U : done with the copy?
|
||
|
|
+ cmplt $18, 8, $1
|
||
|
|
+ bne $1, $less_8
|
||
|
|
+ xor $16, $17, $1 # E : are source and dest alignments the same?
|
||
|
|
+ and $1, 7, $1 # E : are they the same mod 8?
|
||
|
|
+
|
||
|
|
+ bne $1, $misaligned # U : Nope - gotta do this the slow way
|
||
|
|
+ /* source and dest are same mod 8 address */
|
||
|
|
+ and $16, 7, $1 # E : Are both 0mod8?
|
||
|
|
+ beq $1, $both_0mod8 # U : Yes
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ /*
|
||
|
|
+ * source and dest are same misalignment. move a byte at a time
|
||
|
|
+ * until a 0mod8 alignment for both is reached.
|
||
|
|
+ * At least one byte more to move
|
||
|
|
+ */
|
||
|
|
+
|
||
|
|
+ ldi $2, 8
|
||
|
|
+ subl $2, $1, $1
|
||
|
|
+
|
||
|
|
+$head_align:
|
||
|
|
+ addl $16, $1, $16
|
||
|
|
+ addl $17, $1, $17
|
||
|
|
+ subl $18, $1, $18
|
||
|
|
+ ldih $2, $jumppointh($29) !gprelhigh
|
||
|
|
+ s4addl $1, $2, $2
|
||
|
|
+ ldw $2, $jumppointh($2) !gprellow
|
||
|
|
+ addl $2, $29, $2
|
||
|
|
+ jmp ($2)
|
||
|
|
+
|
||
|
|
+$both_0mod8:
|
||
|
|
+ cmple $18, 127, $1 # E : Can we unroll the loop?
|
||
|
|
+ bne $1, $no_unroll # U :
|
||
|
|
+ and $16, 63, $1 # E : get mod64 alignment
|
||
|
|
+ beq $1, $do_unroll # U : no single quads to fiddle
|
||
|
|
+
|
||
|
|
+$single_head_quad:
|
||
|
|
+ ldl $1, 0($17) # L : get 8 bytes
|
||
|
|
+ subl $18, 8, $18 # E : count -= 8
|
||
|
|
+ addl $17, 8, $17 # E : src += 8
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stl $1, 0($16) # L : store
|
||
|
|
+ addl $16, 8, $16 # E : dest += 8
|
||
|
|
+ and $16, 63, $1 # E : get mod64 alignment
|
||
|
|
+ bne $1, $single_head_quad # U : still not fully aligned
|
||
|
|
+
|
||
|
|
+$do_unroll:
|
||
|
|
+ ldih $1, 8($31) # big than 512K
|
||
|
|
+ cmple $18, $1, $1
|
||
|
|
+ beq $1, $unroll_body_512
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ cmple $18, 63, $1 # E : Can we go through the unrolled loop?
|
||
|
|
+ bne $1, $tail_quads # U : Nope
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+$unroll_body:
|
||
|
|
+ ldl $6, 0($17) # L0 : bytes 0..7
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ ldl $4, 8($17) # L : bytes 8..15
|
||
|
|
+ ldl $5, 16($17) # L : bytes 16..23
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ ldl $3, 24($17) # L : bytes 24..31
|
||
|
|
+ addl $16, 64, $1 # E : fallback value for wh64
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ addl $17, 32, $17 # E : src += 32 bytes
|
||
|
|
+ stl $6, 0($16) # L : bytes 0..7
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stl $4, 8($16) # L : bytes 8..15
|
||
|
|
+ stl $5, 16($16) # L : bytes 16..23
|
||
|
|
+ subl $18, 192, $2 # E : At least two more trips to go?
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stl $3, 24($16) # L : bytes 24..31
|
||
|
|
+ addl $16, 32, $16 # E : dest += 32 bytes
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ ldl $6, 0($17) # L : bytes 0..7
|
||
|
|
+ ldl $4, 8($17) # L : bytes 8..15
|
||
|
|
+ # fallback wh64 address if < 2 more trips
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ ldl $5, 16($17) # L : bytes 16..23
|
||
|
|
+ ldl $3, 24($17) # L : bytes 24..31
|
||
|
|
+ addl $16, 32, $16 # E : dest += 32
|
||
|
|
+ subl $18, 64, $18 # E : count -= 64
|
||
|
|
+
|
||
|
|
+ addl $17, 32, $17 # E : src += 32
|
||
|
|
+ stl $6, -32($16) # L : bytes 0..7
|
||
|
|
+ stl $4, -24($16) # L : bytes 8..15
|
||
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
||
|
|
+
|
||
|
|
+ stl $5, -16($16) # L : bytes 16..23
|
||
|
|
+ stl $3, -8($16) # L : bytes 24..31
|
||
|
|
+ nop # E :
|
||
|
|
+ beq $1, $unroll_body
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ br $tail_quads
|
||
|
|
+
|
||
|
|
+$unroll_body_512:
|
||
|
|
+ fillcs 128*4($17)
|
||
|
|
+ e_fillcs 128*20($17)
|
||
|
|
+
|
||
|
|
+ fillcs 128*3($16) #add by ZJ20220620 stl_nc->stl
|
||
|
|
+ e_fillcs 128*7($16)
|
||
|
|
+
|
||
|
|
+ ldl $6, 0($17) # L0 : bytes 0..7
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ ldl $4, 8($17) # L : bytes 8..15
|
||
|
|
+ ldl $5, 16($17) # L : bytes 16..23
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ ldl $3, 24($17) # L : bytes 24..31
|
||
|
|
+ addl $16, 64, $1 # E : fallback value for wh64
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ addl $17, 32, $17 # E : src += 32 bytes
|
||
|
|
+ stl $6, 0($16) # L : bytes 0..7
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stl $4, 8($16) # L : bytes 8..15
|
||
|
|
+ stl $5, 16($16) # L : bytes 16..23
|
||
|
|
+ subl $18, 192, $2 # E : At least two more trips to go?
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stl $3, 24($16) # L : bytes 24..31
|
||
|
|
+ addl $16, 32, $16 # E : dest += 32 bytes
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ ldl $6, 0($17) # L : bytes 0..7
|
||
|
|
+ ldl $4, 8($17) # L : bytes 8..15
|
||
|
|
+ # fallback wh64 address if < 2 more trips
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ ldl $5, 16($17) # L : bytes 16..23
|
||
|
|
+ ldl $3, 24($17) # L : bytes 24..31
|
||
|
|
+ addl $16, 32, $16 # E : dest += 32
|
||
|
|
+ subl $18, 64, $18 # E : count -= 64
|
||
|
|
+
|
||
|
|
+ addl $17, 32, $17 # E : src += 32
|
||
|
|
+ stl $6, -32($16) # L : bytes 0..7
|
||
|
|
+ stl $4, -24($16) # L : bytes 8..15
|
||
|
|
+ cmple $18, 63, $1 # E : At least one more trip?
|
||
|
|
+
|
||
|
|
+ stl $5, -16($16) # L : bytes 16..23
|
||
|
|
+ stl $3, -8($16) # L : bytes 24..31
|
||
|
|
+ nop # E :
|
||
|
|
+ beq $1, $unroll_body_512
|
||
|
|
+
|
||
|
|
+$tail_quads:
|
||
|
|
+$no_unroll:
|
||
|
|
+ .align 4
|
||
|
|
+ subl $18, 8, $18 # E : At least a quad left?
|
||
|
|
+ blt $18, $less_than_8 # U : Nope
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+$move_a_quad:
|
||
|
|
+ ldl $1, 0($17) # L : fetch 8
|
||
|
|
+ subl $18, 8, $18 # E : count -= 8
|
||
|
|
+ addl $17, 8, $17 # E : src += 8
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stl $1, 0($16) # L : store 8
|
||
|
|
+ addl $16, 8, $16 # E : dest += 8
|
||
|
|
+ bge $18, $move_a_quad # U :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+$less_than_8:
|
||
|
|
+ .align 4
|
||
|
|
+ addl $18, 8, $18 # E : add back for trailing bytes
|
||
|
|
+ ble $18, $nomoredata # U : All-done
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ /* Trailing bytes */
|
||
|
|
+$tail_bytes:
|
||
|
|
+ subl $18, 1, $18 # E : count--
|
||
|
|
+ ldbu $1, 0($17) # L : fetch a byte
|
||
|
|
+ addl $17, 1, $17 # E : src++
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stb $1, 0($16) # L : store a byte
|
||
|
|
+ addl $16, 1, $16 # E : dest++
|
||
|
|
+ bgt $18, $tail_bytes # U : more to be done?
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
|
||
|
|
+ ret $31, ($26), 1 # L0 :
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+$misaligned:
|
||
|
|
+ mov $0, $4 # E : dest temp
|
||
|
|
+ and $0, 7, $1 # E : dest alignment mod8
|
||
|
|
+ beq $1, $dest_0mod8 # U : life doesnt totally suck
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+$aligndest:
|
||
|
|
+ ble $18, $nomoredata # U :
|
||
|
|
+ ldbu $1, 0($17) # L : fetch a byte
|
||
|
|
+ subl $18, 1, $18 # E : count--
|
||
|
|
+ addl $17, 1, $17 # E : src++
|
||
|
|
+
|
||
|
|
+ stb $1, 0($4) # L : store it
|
||
|
|
+ addl $4, 1, $4 # E : dest++
|
||
|
|
+ and $4, 7, $1 # E : dest 0mod8 yet?
|
||
|
|
+ bne $1, $aligndest # U : go until we are aligned.
|
||
|
|
+
|
||
|
|
+ /* Source has unknown alignment, but dest is known to be 0mod8 */
|
||
|
|
+$dest_0mod8:
|
||
|
|
+ subl $18, 8, $18 # E : At least a quad left?
|
||
|
|
+ blt $18, $misalign_tail # U : Nope
|
||
|
|
+ ldl_u $3, 0($17) # L : seed (rotating load) of 8 bytes
|
||
|
|
+ ldih $1, 8($31)
|
||
|
|
+ subl $1, 8, $1
|
||
|
|
+ cmple $18, $1, $1
|
||
|
|
+ beq $1, $mis_quad_big # big than 512K
|
||
|
|
+
|
||
|
|
+$mis_quad:
|
||
|
|
+ ldl_u $16, 8($17) # L : Fetch next 8
|
||
|
|
+ ext3b $3, $17, $3 # U : masking
|
||
|
|
+ ext7b $16, $17, $1 # U : masking
|
||
|
|
+ bis $3, $1, $1 # E : merged bytes to store
|
||
|
|
+
|
||
|
|
+ subl $18, 8, $18 # E : count -= 8
|
||
|
|
+ addl $17, 8, $17 # E : src += 8
|
||
|
|
+ stl $1, 0($4) # L : store 8 (aligned)
|
||
|
|
+ mov $16, $3 # E : "rotate" source data
|
||
|
|
+
|
||
|
|
+ addl $4, 8, $4 # E : dest += 8
|
||
|
|
+ bge $18, $mis_quad # U : More quads to move
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ br $misalign_tail
|
||
|
|
+
|
||
|
|
+$mis_quad_big:
|
||
|
|
+ fillcs 128*4($17)
|
||
|
|
+ e_fillcs 128*20($17)
|
||
|
|
+ ldl_u $16, 8($17) # L : Fetch next 8
|
||
|
|
+ ext3b $3, $17, $3 # U : masking
|
||
|
|
+ ext7b $16, $17, $1 # U : masking
|
||
|
|
+ bis $3, $1, $1 # E : merged bytes to store
|
||
|
|
+
|
||
|
|
+ fillcs 128*9($17) #add by ZJ20220620 stl_nc->stl
|
||
|
|
+ e_fillcs 128*15($17)
|
||
|
|
+
|
||
|
|
+ subl $18, 8, $18 # E : count -= 8
|
||
|
|
+ addl $17, 8, $17 # E : src += 8
|
||
|
|
+ stl $1, 0($4) # L : store 8 (aligned)
|
||
|
|
+ mov $16, $3 # E : "rotate" source data
|
||
|
|
+
|
||
|
|
+ addl $4, 8, $4 # E : dest += 8
|
||
|
|
+ bge $18, $mis_quad_big # U : More quads to move
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+$misalign_tail:
|
||
|
|
+ addl $18, 8, $18 # E : account for tail stuff
|
||
|
|
+ ble $18, $nomoredata # U :
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+$misalign_byte:
|
||
|
|
+ ldbu $1, 0($17) # L : fetch 1
|
||
|
|
+ subl $18, 1, $18 # E : count--
|
||
|
|
+ addl $17, 1, $17 # E : src++
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stb $1, 0($4) # L : store
|
||
|
|
+ addl $4, 1, $4 # E : dest++
|
||
|
|
+ bgt $18, $misalign_byte # U : more to go?
|
||
|
|
+ nop
|
||
|
|
+ br $nomoredata
|
||
|
|
+
|
||
|
|
+$less_8:
|
||
|
|
+ ldbu $1, 0($17) # L : fetch 1
|
||
|
|
+ subl $18, 1, $18 # E : count--
|
||
|
|
+ addl $17, 1, $17 # E : src++
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ stb $1, 0($16) # L : store
|
||
|
|
+ addl $16, 1, $16 # E : dest++
|
||
|
|
+ bgt $18, $less_8 # U : more to go?
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+$nomoredata:
|
||
|
|
+ ret $31, ($26), 1 # L0 :
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+J$H01:
|
||
|
|
+ ldbu $1,-1($17)
|
||
|
|
+ stb $1,-1($16)
|
||
|
|
+ br $both_0mod8
|
||
|
|
+
|
||
|
|
+J$H02:
|
||
|
|
+ ldh $1,-2($17)
|
||
|
|
+ sth $1,-2($16)
|
||
|
|
+ br $both_0mod8
|
||
|
|
+
|
||
|
|
+J$H03:
|
||
|
|
+ ldh $1,-2($17)
|
||
|
|
+ ldbu $2,-3($17)
|
||
|
|
+ sth $1,-2($16)
|
||
|
|
+ stb $2,-3($16)
|
||
|
|
+ br $both_0mod8
|
||
|
|
+
|
||
|
|
+J$H04:
|
||
|
|
+ ldw $1,-4($17)
|
||
|
|
+ stw $1,-4($16)
|
||
|
|
+ br $both_0mod8
|
||
|
|
+
|
||
|
|
+J$H05:
|
||
|
|
+ ldw $1,-4($17)
|
||
|
|
+ ldbu $2,-5($17)
|
||
|
|
+ stw $1,-4($16)
|
||
|
|
+ stb $2,-5($16)
|
||
|
|
+ br $both_0mod8
|
||
|
|
+
|
||
|
|
+J$H06:
|
||
|
|
+ ldw $1,-4($17)
|
||
|
|
+ ldh $2,-6($17)
|
||
|
|
+ stw $1,-4($16)
|
||
|
|
+ sth $2,-6($16)
|
||
|
|
+ br $both_0mod8
|
||
|
|
+
|
||
|
|
+J$H07:
|
||
|
|
+ ldw $1,-4($17)
|
||
|
|
+ ldh $2,-6($17)
|
||
|
|
+ ldbu $3,-7($17)
|
||
|
|
+ stw $1,-4($16)
|
||
|
|
+ sth $2,-6($16)
|
||
|
|
+ stb $3,-7($16)
|
||
|
|
+ br $both_0mod8
|
||
|
|
+
|
||
|
|
+END(memcpy)
|
||
|
|
+libc_hidden_builtin_def (memcpy)
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/memset.S b/sysdeps/sw_64/sw6b/memset.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..0085ac70
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/memset.S
|
||
|
|
@@ -0,0 +1,312 @@
|
||
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
||
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
||
|
|
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+
|
||
|
|
+ .arch sw6b
|
||
|
|
+ .set noat
|
||
|
|
+ .set noreorder
|
||
|
|
+
|
||
|
|
+ENTRY(memset)
|
||
|
|
+#ifdef PROF
|
||
|
|
+ ldgp gp, 0(pv)
|
||
|
|
+ ldi AT, _mcount
|
||
|
|
+ call AT, (AT), _mcount
|
||
|
|
+ .prologue 1
|
||
|
|
+#else
|
||
|
|
+ .prologue 0
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ /*
|
||
|
|
+ * Serious stalling happens. The only way to mitigate this is to
|
||
|
|
+ * undertake a major re-write to interleave the constant materialization
|
||
|
|
+ * with other parts of the fall-through code. This is important, even
|
||
|
|
+ * though it makes maintenance tougher.
|
||
|
|
+ * Do this later.
|
||
|
|
+ */
|
||
|
|
+ and $17, 255, $1 # E : 00000000000000ch
|
||
|
|
+ ins0b $17, 1, $2 # U : 000000000000ch00
|
||
|
|
+ mov $16, $0 # E : return value
|
||
|
|
+ mov $17, $8 # E : Save the ch
|
||
|
|
+ ble $18, $end # U : zero length requested?
|
||
|
|
+
|
||
|
|
+ addl $18, $16, $6 # E : max address to write to
|
||
|
|
+ or $1, $2, $17 # E : 000000000000chch
|
||
|
|
+ ins0b $1, 2, $3 # U : 0000000000ch0000
|
||
|
|
+ ins0b $1, 3, $4 # U : 00000000ch000000
|
||
|
|
+
|
||
|
|
+ or $3, $4, $3 # E : 00000000chch0000
|
||
|
|
+ ins1b $17, 4, $5 # U : 0000chch00000000
|
||
|
|
+ xor $16, $6, $1 # E : will complete write be within one quadword?
|
||
|
|
+ ins1b $17, 6, $2 # U : chch000000000000
|
||
|
|
+
|
||
|
|
+ or $17, $3, $17 # E : 00000000chchchch
|
||
|
|
+ or $2, $5, $2 # E : chchchch00000000
|
||
|
|
+ bic $1, 7, $1 # E : fit within a single quadword?
|
||
|
|
+ and $16, 7, $3 # E : Target addr misalignment
|
||
|
|
+
|
||
|
|
+ or $17, $2, $17 # E : chchchchchchchch
|
||
|
|
+ beq $1, $within_quad # U :
|
||
|
|
+ nop # E :
|
||
|
|
+ beq $3, $aligned # U : target is 0mod8
|
||
|
|
+
|
||
|
|
+ /*
|
||
|
|
+ * Target address is misaligned, and won't fit within a quadword.
|
||
|
|
+ */
|
||
|
|
+
|
||
|
|
+#ifdef pixman_error
|
||
|
|
+ /* if the addr is unaligned in multi-thread, this will cause thread
|
||
|
|
+ unsafty,so use stb to store the trailing bytes. */
|
||
|
|
+ ldl_u $4, 0($16) # L : Fetch first partial
|
||
|
|
+ mov $16, $5 # E : Save the address
|
||
|
|
+ ins3b $17, $16, $2 # U : Insert new bytes
|
||
|
|
+ subl $3, 8, $3 # E : Invert (for addressing uses)
|
||
|
|
+
|
||
|
|
+ addl $18, $3, $18 # E : $18 is new count ($3 is negative)
|
||
|
|
+ mask3b $4, $16, $4 # U : clear relevant parts of the quad
|
||
|
|
+ subl $16, $3, $16 # E : $16 is new aligned destination
|
||
|
|
+ or $2, $4, $1 # E : Final bytes
|
||
|
|
+
|
||
|
|
+ nop
|
||
|
|
+ stl_u $1,0($5) # L : Store result
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+#else
|
||
|
|
+$misaligned:
|
||
|
|
+ stb $8, 0($16)
|
||
|
|
+ subl $18, 1, $18
|
||
|
|
+ beq $18, $end
|
||
|
|
+ addl $16, 1, $16
|
||
|
|
+ and $16, 7, $3 # E : Target addr misalignment
|
||
|
|
+ bne $3, $misaligned
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$aligned:
|
||
|
|
+ /*
|
||
|
|
+ * We are now guaranteed to be quad aligned, with at least
|
||
|
|
+ * one partial quad to write.
|
||
|
|
+ */
|
||
|
|
+
|
||
|
|
+ sra $18, 3, $3 # U : Number of remaining quads to write
|
||
|
|
+ and $18, 7, $18 # E : Number of trailing bytes to write
|
||
|
|
+ mov $16, $5 # E : Save dest address
|
||
|
|
+ beq $3, $no_quad # U : tail stuff only
|
||
|
|
+
|
||
|
|
+ /*
|
||
|
|
+ * It's worth the effort to unroll this and use wh64 if possible.
|
||
|
|
+ * At this point, entry values are:
|
||
|
|
+ * $16 Current destination address
|
||
|
|
+ * $5 A copy of $16
|
||
|
|
+ * $6 The max quadword address to write to
|
||
|
|
+ * $18 Number trailer bytes
|
||
|
|
+ * $3 Number quads to write
|
||
|
|
+ */
|
||
|
|
+# and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
|
||
|
|
+ and $16, 0x1f, $2 # E : Forward work (only useful for unrolled loop)
|
||
|
|
+ subl $3, 16, $4 # E : Only try to unroll if > 128 bytes
|
||
|
|
+ subl $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
|
||
|
|
+ blt $4, $loop # U :
|
||
|
|
+
|
||
|
|
+ /*
|
||
|
|
+ * We know we've got at least 16 quads, minimum of one trip
|
||
|
|
+ * through unrolled loop. Do a quad at a time to get us 0mod64
|
||
|
|
+ * aligned.
|
||
|
|
+ */
|
||
|
|
+
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+ nop # E :
|
||
|
|
+# beq $1, $bigalign # U :
|
||
|
|
+ beq $2, $bigalign # U :
|
||
|
|
+$alignmod32:
|
||
|
|
+ stl $17, 0($5) # L :
|
||
|
|
+ subl $3, 1, $3 # E : For consistency later
|
||
|
|
+ addl $1, 8, $1 # E : Increment towards zero for alignment
|
||
|
|
+# addl $5, 8, $4 # E : Initial wh64 address (filler instruction)
|
||
|
|
+
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ addl $5, 8, $5 # E : Inc address
|
||
|
|
+ blt $1, $alignmod32 # U :
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+$bigalign:
|
||
|
|
+ ldih $1, 8($31) # big than 512KB
|
||
|
|
+ cmple $18, $1, $1
|
||
|
|
+ beq $1, $do_wh64_512
|
||
|
|
+
|
||
|
|
+ /*
|
||
|
|
+ * $3 - number quads left to go
|
||
|
|
+ * $5 - target address (aligned 0mod64)
|
||
|
|
+ * $17 - mask of stuff to store
|
||
|
|
+ * Scratch registers available: $7, $2, $4, $1
|
||
|
|
+ * We know that we'll be taking a minimum of one trip through.
|
||
|
|
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
|
||
|
|
+ * Assumes the wh64 needs to be for 2 trips through the loop in the
|
||
|
|
+ * future.The wh64 is issued on for the starting destination address for
|
||
|
|
+ * trip +2 through the loop, and if there are less than two trips left,
|
||
|
|
+ * the target address will be for the current trip. */
|
||
|
|
+
|
||
|
|
+$do_wh64:
|
||
|
|
+# wh64 ($4) # L1 : memory subsystem write hint
|
||
|
|
+ subl $3, 24, $2 # E : For determining future wh64 addresses
|
||
|
|
+ stl $17, 0($5) # L :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+# addl $5, 128, $4 # E : speculative target of next wh64
|
||
|
|
+ stl $17, 8($5) # L :
|
||
|
|
+ stl $17, 16($5) # L :
|
||
|
|
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
|
||
|
|
+
|
||
|
|
+ stl $17, 24($5) # L :
|
||
|
|
+ stl $17, 32($5) # L :
|
||
|
|
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ stl $17, 40($5) # L :
|
||
|
|
+ stl $17, 48($5) # L :
|
||
|
|
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ stl $17, 56($5) # L :
|
||
|
|
+ addl $5, 64, $5 # E :
|
||
|
|
+ subl $3, 8, $3 # E :
|
||
|
|
+ bge $2, $do_wh64 # U :
|
||
|
|
+
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ beq $3, $no_quad # U : Might have finished already
|
||
|
|
+
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ br $loop # U : Might have finished already
|
||
|
|
+
|
||
|
|
+$do_wh64_512:
|
||
|
|
+# wh64 ($4) # L1 : memory subsystem write hint
|
||
|
|
+ subl $3, 24, $2 # E : For determining future wh64 addresses
|
||
|
|
+
|
||
|
|
+ fillcs 128*1($5)
|
||
|
|
+ e_fillcs 128*5($5)
|
||
|
|
+
|
||
|
|
+# stl_nc $17, 0($5) # L :
|
||
|
|
+ stl $17, 0($5) # L :
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+# addl $5, 128, $4 # E : speculative target of next wh64
|
||
|
|
+# stl_nc $17, 8($5) # L :
|
||
|
|
+ stl $17, 8($5) # L :
|
||
|
|
+# stl_nc $17, 16($5) # L :
|
||
|
|
+ stl $17, 16($5) # L :
|
||
|
|
+ addl $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
|
||
|
|
+
|
||
|
|
+# stl_nc $17, 24($5) # L :
|
||
|
|
+ stl $17, 24($5) # L :
|
||
|
|
+# stl_nc $17, 32($5) # L :
|
||
|
|
+ stl $17, 32($5) # L :
|
||
|
|
+# sellt $2, $7, $4, $4 # E : Latency 2, extra mapping cycle
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+# stl_nc $17, 40($5) # L :
|
||
|
|
+ stl $17, 40($5) # L :
|
||
|
|
+# stl_nc $17, 48($5) # L :
|
||
|
|
+ stl $17, 48($5) # L :
|
||
|
|
+ subl $3, 16, $2 # E : Repeat the loop at least once more?
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+# stl_nc $17, 56($5) # L :
|
||
|
|
+ stl $17, 56($5) # L :
|
||
|
|
+ addl $5, 64, $5 # E :
|
||
|
|
+ subl $3, 8, $3 # E :
|
||
|
|
+ bge $2, $do_wh64_512 # U :
|
||
|
|
+
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ beq $3, $no_quad # U : Might have finished already
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+ /*
|
||
|
|
+ * Simple loop for trailing quadwords, or for small amounts
|
||
|
|
+ * of data (where we can't use an unrolled loop and wh64)
|
||
|
|
+ */
|
||
|
|
+$loop:
|
||
|
|
+ stl $17, 0($5) # L :
|
||
|
|
+ subl $3, 1, $3 # E : Decrement number quads left
|
||
|
|
+ addl $5, 8, $5 # E : Inc address
|
||
|
|
+ bne $3, $loop # U : more?
|
||
|
|
+
|
||
|
|
+$no_quad:
|
||
|
|
+ /*
|
||
|
|
+ * Write 0..7 trailing bytes.
|
||
|
|
+ */
|
||
|
|
+ nop # E :
|
||
|
|
+ beq $18, $end # U : All done?
|
||
|
|
+
|
||
|
|
+#ifndef pixman_error
|
||
|
|
+/* if the addr is unaligned in multi-thread, this will cause thread unsafty,
|
||
|
|
+ so use stb to store the trailing bytes. */
|
||
|
|
+$trailing:
|
||
|
|
+ stb $17, 0($5)
|
||
|
|
+ subl $18, 1, $18
|
||
|
|
+ beq $18, $end
|
||
|
|
+ addl $5, 1, $5
|
||
|
|
+ br $trailing
|
||
|
|
+#else
|
||
|
|
+ ldl $7, 0($5) # L :
|
||
|
|
+ mask7b $7, $6, $2 # U : Mask final quad
|
||
|
|
+
|
||
|
|
+ ins7b $17, $6, $4 # U : New bits
|
||
|
|
+ or $2, $4, $1 # E : Put it all together
|
||
|
|
+ stl $1, 0($5) # L : And back to memory
|
||
|
|
+ ret $31,($26),1 # L0 :
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+$within_quad:
|
||
|
|
+#ifdef PIXMAN_ERROR
|
||
|
|
+ /* if the addr is unaligned in multi-thread, this will cause thread
|
||
|
|
+ unsafty,so use stb to store the trailing bytes. */
|
||
|
|
+ ldl_u $1, 0($16) # L :
|
||
|
|
+ ins3b $17, $16, $2 # U : New bits
|
||
|
|
+ mask3b $1, $16, $4 # U : Clear old
|
||
|
|
+ or $2, $4, $2 # E : New result
|
||
|
|
+
|
||
|
|
+ mask3b $2, $6, $4 # U :
|
||
|
|
+ mask7b $1, $6, $2 # U :
|
||
|
|
+ or $2, $4, $1 # E :
|
||
|
|
+ stl_u $1, 0($16) # L :
|
||
|
|
+#else
|
||
|
|
+ stb $8, 0($16)
|
||
|
|
+ subl $18, 1, $18
|
||
|
|
+ beq $18, $end
|
||
|
|
+ addl $16, 1, $16
|
||
|
|
+ br $within_quad
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+$end:
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ ret $31,($26),1 # L0 :
|
||
|
|
+
|
||
|
|
+ END(memset)
|
||
|
|
+libc_hidden_builtin_def (memset)
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/rshift.S b/sysdeps/sw_64/sw6b/rshift.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..ec2a78b0
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/rshift.S
|
||
|
|
@@ -0,0 +1,170 @@
|
||
|
|
+ # Sw_64 __mpn_rshift --
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # cnt r19
|
||
|
|
+
|
||
|
|
+ # This code runs at 3.25 cycles/limb on the sw_64.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_rshift
|
||
|
|
+ .ent __mpn_rshift
|
||
|
|
+__mpn_rshift:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ ldl $4,0($17) # load first limb
|
||
|
|
+ subl $31,$19,$20
|
||
|
|
+ subl $18,1,$18
|
||
|
|
+ and $18,4-1,$28 # number of limbs in first loop
|
||
|
|
+ sll $4,$20,$0 # compute function result
|
||
|
|
+
|
||
|
|
+ beq $28,.L0
|
||
|
|
+ subl $18,$28,$18
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop0: ldl $3,8($17)
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ srl $4,$19,$5
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ subl $28,1,$28
|
||
|
|
+ sll $3,$20,$6
|
||
|
|
+ or $3,$3,$4
|
||
|
|
+ or $5,$6,$8
|
||
|
|
+ stl $8,-8($16)
|
||
|
|
+ bne $28,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: srl $4,$19,$24
|
||
|
|
+ beq $18,.Lend
|
||
|
|
+ # warm up phase 1
|
||
|
|
+ ldl $1,8($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ ldl $2,16($17)
|
||
|
|
+ ldl $3,24($17)
|
||
|
|
+ ldl $4,32($17)
|
||
|
|
+ beq $18,.Lend1
|
||
|
|
+ # warm up phase 2
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ ldl $1,40($17)
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ ldl $2,48($17)
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ ldl $3,56($17)
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ ldl $4,64($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ beq $18,.Lend2
|
||
|
|
+ .align 4
|
||
|
|
+ # main loop
|
||
|
|
+.Loop: stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ ldl $1,72($17)
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ ldl $2,80($17)
|
||
|
|
+
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ addl $16,32,$16
|
||
|
|
+
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ ldl $3,88($17)
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ ldl $4,96($17)
|
||
|
|
+
|
||
|
|
+ addl $17,32,$17
|
||
|
|
+ bne $18,.Loop
|
||
|
|
+ # cool down phase 2/1
|
||
|
|
+.Lend2: stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ # cool down phase 2/2
|
||
|
|
+ stl $7,32($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,40($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,48($16)
|
||
|
|
+ stl $6,56($16)
|
||
|
|
+ # cool down phase 2/3
|
||
|
|
+ stl $24,64($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ # cool down phase 1/1
|
||
|
|
+.Lend1: sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ # cool down phase 1/2
|
||
|
|
+ stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ stl $24,32($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+.Lend: stl $24,0($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_rshift
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/stxcpy.S b/sysdeps/sw_64/sw6b/stxcpy.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..cf07eb8e
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/stxcpy.S
|
||
|
|
@@ -0,0 +1,314 @@
|
||
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
||
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
||
|
|
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+/* Copy a null-terminated string from SRC to DST.
|
||
|
|
+
|
||
|
|
+ This is an internal routine used by strcpy, stpcpy, and strcat.
|
||
|
|
+ As such, it uses special linkage conventions to make implementation
|
||
|
|
+ of these public functions more efficient.
|
||
|
|
+
|
||
|
|
+ On input:
|
||
|
|
+ t9 = return address
|
||
|
|
+ a0 = DST
|
||
|
|
+ a1 = SRC
|
||
|
|
+
|
||
|
|
+ On output:
|
||
|
|
+ t8 = bitmask (with one bit set) indicating the last byte written
|
||
|
|
+ a0 = unaligned address of the last *word* written
|
||
|
|
+
|
||
|
|
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
||
|
|
+*/
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+
|
||
|
|
+ .arch ev6
|
||
|
|
+ .set noat
|
||
|
|
+ .set noreorder
|
||
|
|
+
|
||
|
|
+ .text
|
||
|
|
+ .type __stxcpy, @function
|
||
|
|
+ .globl __stxcpy
|
||
|
|
+ .usepv __stxcpy, no
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (t9)
|
||
|
|
+
|
||
|
|
+ /* On entry to this basic block:
|
||
|
|
+ t0 == the first destination word for masking back in
|
||
|
|
+ t1 == the first source word. */
|
||
|
|
+ .align 4
|
||
|
|
+stxcpy_aligned:
|
||
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
||
|
|
+ ldi t2, -1 # E : build a mask against false zero
|
||
|
|
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
|
||
|
|
+ mask7b t1, a1, t3 # U :
|
||
|
|
+ ornot t1, t2, t2 # E : (stall)
|
||
|
|
+
|
||
|
|
+ mask3b t0, a1, t0 # U : assemble the first output word
|
||
|
|
+ cmpgeb zero, t2, t10 # E : bits set iff null found
|
||
|
|
+ or t0, t3, t1 # E : (stall)
|
||
|
|
+ bne t10, $a_eos # U : (stall)
|
||
|
|
+
|
||
|
|
+ /* On entry to this basic block:
|
||
|
|
+ t0 == the first destination word for masking back in
|
||
|
|
+ t1 == a source word not containing a null. */
|
||
|
|
+ /* Nops here to separate store quads from load quads */
|
||
|
|
+
|
||
|
|
+$a_loop:
|
||
|
|
+ stl_u t1, 0(a0) # L :
|
||
|
|
+ addl a0, 8, a0 # E :
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ ldl_u t1, 0(a1) # L : Latency=3
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+ cmpgeb zero, t1, t10 # E : (3 cycle stall)
|
||
|
|
+ beq t10, $a_loop # U : (stall for t10)
|
||
|
|
+
|
||
|
|
+ /* Take care of the final (partial) word store.
|
||
|
|
+ On entry to this basic block we have:
|
||
|
|
+ t1 == the source word containing the null
|
||
|
|
+ t10 == the cmpgeb mask that found it. */
|
||
|
|
+$a_eos:
|
||
|
|
+ negl t10, t6 # E : find low bit set
|
||
|
|
+ and t10, t6, t8 # E : (stall)
|
||
|
|
+ /* For the sake of the cache, don't read a destination word
|
||
|
|
+ if we're not going to need it. */
|
||
|
|
+ and t8, 0x80, t6 # E : (stall)
|
||
|
|
+ bne t6, 1f # U : (stall)
|
||
|
|
+
|
||
|
|
+ /* We're doing a partial word store and so need to combine
|
||
|
|
+ our source and original destination words. */
|
||
|
|
+ ldl_u t0, 0(a0) # L : Latency=3
|
||
|
|
+ subl t8, 1, t6 # E :
|
||
|
|
+ zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
|
||
|
|
+ or t8, t6, t10 # E : (stall)
|
||
|
|
+
|
||
|
|
+ zap t0, t10, t0 # E : clear dst bytes <= null
|
||
|
|
+ or t0, t1, t1 # E : (stall)
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+1: stl_u t1, 0(a0) # L :
|
||
|
|
+ ret (t9) # L0 : Latency=3
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+__stxcpy:
|
||
|
|
+ /* Are source and destination co-aligned? */
|
||
|
|
+ xor a0, a1, t0 # E :
|
||
|
|
+ unop # E :
|
||
|
|
+ and t0, 7, t0 # E : (stall)
|
||
|
|
+ bne t0, $unaligned # U : (stall)
|
||
|
|
+
|
||
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
||
|
|
+ ldl_u t1, 0(a1) # L : load first src word
|
||
|
|
+ and a0, 7, t0 # E : take care not to load a word ...
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+ beq t0, stxcpy_aligned # U : ... if we wont need it (stall)
|
||
|
|
+
|
||
|
|
+ ldl_u t0, 0(a0) # L :
|
||
|
|
+ br stxcpy_aligned # L0 : Latency=3
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+/* The source and destination are not co-aligned. Align the destination
|
||
|
|
+ and cope. We have to be very careful about not reading too much and
|
||
|
|
+ causing a SEGV. */
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$u_head:
|
||
|
|
+ /* We know just enough now to be able to assemble the first
|
||
|
|
+ full source word. We can still find a zero at the end of it
|
||
|
|
+ that prevents us from outputting the whole thing.
|
||
|
|
+
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t0 == the first dest word, for masking back in, if needed else 0
|
||
|
|
+ t1 == the low bits of the first source word
|
||
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
||
|
|
+
|
||
|
|
+ ldl_u t2, 8(a1) # L :
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+ ext3b t1, a1, t1 # U : (stall on a1)
|
||
|
|
+ ext7b t2, a1, t4 # U : (stall on a1)
|
||
|
|
+
|
||
|
|
+ mask3b t0, a0, t0 # U :
|
||
|
|
+ or t1, t4, t1 # E :
|
||
|
|
+ mask7b t1, a0, t1 # U : (stall on t1)
|
||
|
|
+ or t0, t1, t1 # E : (stall on t1)
|
||
|
|
+
|
||
|
|
+ or t1, t6, t6 # E :
|
||
|
|
+ cmpgeb zero, t6, t10 # E : (stall)
|
||
|
|
+ ldi t6, -1 # E : for masking just below
|
||
|
|
+ bne t10, $u_final # U : (stall)
|
||
|
|
+
|
||
|
|
+ mask3b t6, a1, t6 # U : mask out the bits we have
|
||
|
|
+ or t6, t2, t2 # E : already extracted before (stall)
|
||
|
|
+ cmpgeb zero, t2, t10 # E : testing eos (stall)
|
||
|
|
+ bne t10, $u_late_head_exit # U : (stall)
|
||
|
|
+
|
||
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
||
|
|
+ of and we can set up to enter the main loop. */
|
||
|
|
+
|
||
|
|
+ stl_u t1, 0(a0) # L : store first output word
|
||
|
|
+ addl a0, 8, a0 # E :
|
||
|
|
+ ext3b t2, a1, t0 # U : position ho-bits of lo word
|
||
|
|
+ ldl_u t2, 8(a1) # U : read next high-order source word
|
||
|
|
+
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+ cmpgeb zero, t2, t10 # E : (stall for t2)
|
||
|
|
+ nop # E :
|
||
|
|
+ bne t10, $u_eos # U : (stall)
|
||
|
|
+
|
||
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
||
|
|
+ the loop is structured to detect zeros in aligned source words.
|
||
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
||
|
|
+ iteration out into the head and half into the tail, but it does
|
||
|
|
+ prevent nastiness from accumulating in the very thing we want
|
||
|
|
+ to run as fast as possible.
|
||
|
|
+
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t0 == the shifted high-order bits from the previous source word
|
||
|
|
+ t2 == the unshifted current source word
|
||
|
|
+
|
||
|
|
+ We further know that t2 does not contain a null terminator. */
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+$u_loop:
|
||
|
|
+ ext7b t2, a1, t1 # U : extract high bits for current word
|
||
|
|
+ addl a1, 8, a1 # E : (stall)
|
||
|
|
+ ext3b t2, a1, t3 # U : extract low bits for next time (stall)
|
||
|
|
+ addl a0, 8, a0 # E :
|
||
|
|
+
|
||
|
|
+ or t0, t1, t1 # E : current dst word now complete
|
||
|
|
+ ldl_u t2, 0(a1) # L : Latency=3 load high word for next time
|
||
|
|
+ stl_u t1, -8(a0) # L : save the current word (stall)
|
||
|
|
+ mov t3, t0 # E :
|
||
|
|
+
|
||
|
|
+ cmpgeb zero, t2, t10 # E : test new word for eos
|
||
|
|
+ beq t10, $u_loop # U : (stall)
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
||
|
|
+ If it resides in the lower half, we have one (probably partial)
|
||
|
|
+ word to write out, and if it resides in the upper half, we
|
||
|
|
+ have one full and one partial word left to write out.
|
||
|
|
+
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t0 == the shifted high-order bits from the previous source word
|
||
|
|
+ t2 == the unshifted current source word. */
|
||
|
|
+$u_eos:
|
||
|
|
+ ext7b t2, a1, t1 # U :
|
||
|
|
+ or t0, t1, t1 # E : first (partial) source word complete (stall)
|
||
|
|
+ cmpgeb zero, t1, t10 # E : is the null in this first bit? (stall)
|
||
|
|
+ bne t10, $u_final # U : (stall)
|
||
|
|
+
|
||
|
|
+$u_late_head_exit:
|
||
|
|
+ stl_u t1, 0(a0) # L : the null was in the high-order bits
|
||
|
|
+ addl a0, 8, a0 # E :
|
||
|
|
+ ext3b t2, a1, t1 # U :
|
||
|
|
+ cmpgeb zero, t1, t10 # E : (stall)
|
||
|
|
+
|
||
|
|
+ /* Take care of a final (probably partial) result word.
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t1 == assembled source word
|
||
|
|
+ t10 == cmpgeb mask that found the null. */
|
||
|
|
+$u_final:
|
||
|
|
+ negl t10, t6 # E : isolate low bit set
|
||
|
|
+ and t6, t10, t8 # E : (stall)
|
||
|
|
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
|
||
|
|
+ bne t6, 1f # U : (stall)
|
||
|
|
+
|
||
|
|
+ ldl_u t0, 0(a0) # E :
|
||
|
|
+ subl t8, 1, t6 # E :
|
||
|
|
+ or t6, t8, t10 # E : (stall)
|
||
|
|
+ zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
|
||
|
|
+
|
||
|
|
+ zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
|
||
|
|
+ or t0, t1, t1 # E : (stall)
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+1: stl_u t1, 0(a0) # L :
|
||
|
|
+ ret (t9) # L0 : Latency=3
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ /* Unaligned copy entry point. */
|
||
|
|
+ .align 4
|
||
|
|
+$unaligned:
|
||
|
|
+
|
||
|
|
+ ldl_u t1, 0(a1) # L : load first source word
|
||
|
|
+ and a0, 7, t4 # E : find dest misalignment
|
||
|
|
+ and a1, 7, t5 # E : find src misalignment
|
||
|
|
+ /* Conditionally load the first destination word and a bytemask
|
||
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
||
|
|
+ mov zero, t0 # E :
|
||
|
|
+
|
||
|
|
+ mov zero, t6 # E :
|
||
|
|
+ beq t4, 1f # U :
|
||
|
|
+ ldl_u t0, 0(a0) # L :
|
||
|
|
+ ldi t6, -1 # E :
|
||
|
|
+
|
||
|
|
+ mask3b t6, a0, t6 # U :
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+1:
|
||
|
|
+ subl a1, t4, a1 # E : sub dest misalignment from src addr
|
||
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
||
|
|
+ extra startup checks to avoid SEGV. */
|
||
|
|
+ cmplt t4, t5, t8 # E :
|
||
|
|
+ beq t8, $u_head # U :
|
||
|
|
+ ldi t2, -1 # E : mask out leading garbage in source
|
||
|
|
+
|
||
|
|
+ mask7b t2, t5, t2 # U :
|
||
|
|
+ ornot t1, t2, t3 # E : (stall)
|
||
|
|
+ cmpgeb zero, t3, t10 # E : is there a zero? (stall)
|
||
|
|
+ beq t10, $u_head # U : (stall)
|
||
|
|
+
|
||
|
|
+ /* At this point we've found a zero in the first partial word of
|
||
|
|
+ the source. We need to isolate the valid source data and mask
|
||
|
|
+ it into the original destination data. (Incidentally, we know
|
||
|
|
+ that we'll need at least one byte of that original dest word.) */
|
||
|
|
+
|
||
|
|
+ ldl_u t0, 0(a0) # L :
|
||
|
|
+ negl t10, t6 # E : build bitmask of bytes <= zero
|
||
|
|
+ and t6, t10, t8 # E : (stall)
|
||
|
|
+ and a1, 7, t5 # E :
|
||
|
|
+
|
||
|
|
+ subl t8, 1, t6 # E :
|
||
|
|
+ or t6, t8, t10 # E : (stall)
|
||
|
|
+ srl t8, t5, t8 # U : adjust final null return value
|
||
|
|
+ zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
|
||
|
|
+
|
||
|
|
+ and t1, t2, t1 # E : to source validity mask
|
||
|
|
+ ext3b t2, a1, t2 # U :
|
||
|
|
+ ext3b t1, a1, t1 # U : (stall)
|
||
|
|
+ andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
|
||
|
|
+
|
||
|
|
+ or t0, t1, t1 # e1 : and put it there
|
||
|
|
+ stl_u t1, 0(a0) # .. e0 : (stall)
|
||
|
|
+ ret (t9) # e1 :
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/stxncpy.S b/sysdeps/sw_64/sw6b/stxncpy.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..c47029ea
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/stxncpy.S
|
||
|
|
@@ -0,0 +1,392 @@
|
||
|
|
+/* Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
||
|
|
+ Contributed by Richard Henderson (rth@tamu.edu)
|
||
|
|
+ SW6 optimized by Rick Gorton <rick.gorton@sw_64-processor.com>.
|
||
|
|
+ This file is part of the GNU C Library.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
|
+ License as published by the Free Software Foundation; either
|
||
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
|
+
|
||
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
|
+ Lesser General Public License for more details.
|
||
|
|
+
|
||
|
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
|
+ License along with the GNU C Library. If not, see
|
||
|
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
|
+
|
||
|
|
+/* Copy no more than COUNT bytes of the null-terminated string from
|
||
|
|
+ SRC to DST.
|
||
|
|
+
|
||
|
|
+ This is an internal routine used by strncpy, stpncpy, and strncat.
|
||
|
|
+ As such, it uses special linkage conventions to make implementation
|
||
|
|
+ of these public functions more efficient.
|
||
|
|
+
|
||
|
|
+ On input:
|
||
|
|
+ t9 = return address
|
||
|
|
+ a0 = DST
|
||
|
|
+ a1 = SRC
|
||
|
|
+ a2 = COUNT
|
||
|
|
+
|
||
|
|
+ Furthermore, COUNT may not be zero.
|
||
|
|
+
|
||
|
|
+ On output:
|
||
|
|
+ t0 = last word written
|
||
|
|
+ t8 = bitmask (with one bit set) indicating the last byte written
|
||
|
|
+ t10 = bitmask (with one bit set) indicating the byte position of
|
||
|
|
+ the end of the range specified by COUNT
|
||
|
|
+ a0 = unaligned address of the last *word* written
|
||
|
|
+ a2 = the number of full words left in COUNT
|
||
|
|
+
|
||
|
|
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
|
||
|
|
+*/
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+
|
||
|
|
+ .arch ev6
|
||
|
|
+ .set noat
|
||
|
|
+ .set noreorder
|
||
|
|
+
|
||
|
|
+ .text
|
||
|
|
+ .type __stxncpy, @function
|
||
|
|
+ .globl __stxncpy
|
||
|
|
+ .usepv __stxncpy, no
|
||
|
|
+
|
||
|
|
+ cfi_startproc
|
||
|
|
+ cfi_return_column (t9)
|
||
|
|
+
|
||
|
|
+ /* On entry to this basic block:
|
||
|
|
+ t0 == the first destination word for masking back in
|
||
|
|
+ t1 == the first source word. */
|
||
|
|
+ .align 4
|
||
|
|
+stxncpy_aligned:
|
||
|
|
+ /* Create the 1st output word and detect 0's in the 1st input word. */
|
||
|
|
+ ldi t2, -1 # E : build a mask against false zero
|
||
|
|
+ mask7b t2, a1, t2 # U : detection in the src word (stall)
|
||
|
|
+ mask7b t1, a1, t3 # U :
|
||
|
|
+ ornot t1, t2, t2 # E : (stall)
|
||
|
|
+
|
||
|
|
+ mask3b t0, a1, t0 # U : assemble the first output word
|
||
|
|
+ cmpgeb zero, t2, t7 # E : bits set iff null found
|
||
|
|
+ or t0, t3, t0 # E : (stall)
|
||
|
|
+ beq a2, $a_eoc # U :
|
||
|
|
+
|
||
|
|
+ bne t7, $a_eos # U :
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ /* On entry to this basic block:
|
||
|
|
+ t0 == a source word not containing a null. */
|
||
|
|
+
|
||
|
|
+ /*
|
||
|
|
+ * nops here to:
|
||
|
|
+ * separate store quads from load quads
|
||
|
|
+ * limit of 1 bcond/quad to permit training
|
||
|
|
+ */
|
||
|
|
+$a_loop:
|
||
|
|
+ stl_u t0, 0(a0) # L :
|
||
|
|
+ addl a0, 8, a0 # E :
|
||
|
|
+ subl a2, 1, a2 # E :
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ ldl_u t0, 0(a1) # L :
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+ cmpgeb zero, t0, t7 # E :
|
||
|
|
+ beq a2, $a_eoc # U :
|
||
|
|
+
|
||
|
|
+ beq t7, $a_loop # U :
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ /* Take care of the final (partial) word store. At this point
|
||
|
|
+ the end-of-count bit is set in t7 iff it applies.
|
||
|
|
+
|
||
|
|
+ On entry to this basic block we have:
|
||
|
|
+ t0 == the source word containing the null
|
||
|
|
+ t7 == the cmpgeb mask that found it. */
|
||
|
|
+$a_eos:
|
||
|
|
+ negl t7, t8 # E : find low bit set
|
||
|
|
+ and t7, t8, t8 # E : (stall)
|
||
|
|
+ /* For the sake of the cache, don't read a destination word
|
||
|
|
+ if we're not going to need it. */
|
||
|
|
+ and t8, 0x80, t6 # E : (stall)
|
||
|
|
+ bne t6, 1f # U : (stall)
|
||
|
|
+
|
||
|
|
+ /* We're doing a partial word store and so need to combine
|
||
|
|
+ our source and original destination words. */
|
||
|
|
+ ldl_u t1, 0(a0) # L :
|
||
|
|
+ subl t8, 1, t6 # E :
|
||
|
|
+ or t8, t6, t7 # E : (stall)
|
||
|
|
+ zapnot t0, t7, t0 # U : clear src bytes > null (stall)
|
||
|
|
+
|
||
|
|
+ zap t1, t7, t1 # .. e1 : clear dst bytes <= null
|
||
|
|
+ or t0, t1, t0 # e1 : (stall)
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+1: stl_u t0, 0(a0) # L :
|
||
|
|
+ ret (t9) # L0 : Latency=3
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ /* Add the end-of-count bit to the eos detection bitmask. */
|
||
|
|
+$a_eoc:
|
||
|
|
+ or t10, t7, t7 # E :
|
||
|
|
+ br $a_eos # L0 : Latency=3
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+__stxncpy:
|
||
|
|
+ /* Are source and destination co-aligned? */
|
||
|
|
+ ldi t2, -1 # E :
|
||
|
|
+ xor a0, a1, t1 # E :
|
||
|
|
+ and a0, 7, t0 # E : find dest misalignment
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ srl t2, 1, t2 # U :
|
||
|
|
+ and t1, 7, t1 # E :
|
||
|
|
+ sellt a2, t2, a2, a2 # E : bound count to LONG_MAX (stall)
|
||
|
|
+ nop # E :
|
||
|
|
+
|
||
|
|
+ addl a2, t0, a2 # E : bias count by dest misalignment
|
||
|
|
+ subl a2, 1, a2 # E : (stall)
|
||
|
|
+ and a2, 7, t2 # E : (stall)
|
||
|
|
+ ldi t10, 1 # E :
|
||
|
|
+
|
||
|
|
+ srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8
|
||
|
|
+ sll t10, t2, t10 # U : t10 = bitmask of last count byte
|
||
|
|
+ nop # E :
|
||
|
|
+ bne t1, $unaligned # U : (stall)
|
||
|
|
+
|
||
|
|
+ /* We are co-aligned; take care of a partial first word. */
|
||
|
|
+ ldl_u t1, 0(a1) # L : load first src word
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+ beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
|
||
|
|
+ ldl_u t0, 0(a0) # L :
|
||
|
|
+
|
||
|
|
+ br stxncpy_aligned # U :
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+/* The source and destination are not co-aligned. Align the destination
|
||
|
|
+ and cope. We have to be very careful about not reading too much and
|
||
|
|
+ causing a SEGV. */
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$u_head:
|
||
|
|
+ /* We know just enough now to be able to assemble the first
|
||
|
|
+ full source word. We can still find a zero at the end of it
|
||
|
|
+ that prevents us from outputting the whole thing.
|
||
|
|
+
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t0 == the first dest word, unmasked
|
||
|
|
+ t1 == the shifted low bits of the first source word
|
||
|
|
+ t6 == bytemask that is -1 in dest word bytes */
|
||
|
|
+
|
||
|
|
+ ldl_u t2, 8(a1) # L : Latency=3 load second src word
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+ mask3b t0, a0, t0 # U : mask trailing garbage in dst
|
||
|
|
+ ext7b t2, a1, t4 # U : (3 cycle stall on t2)
|
||
|
|
+
|
||
|
|
+ or t1, t4, t1 # E : first aligned src word complete (stall)
|
||
|
|
+ mask7b t1, a0, t1 # U : mask leading garbage in src (stall)
|
||
|
|
+ or t0, t1, t0 # E : first output word complete (stall)
|
||
|
|
+ or t0, t6, t6 # E : mask original data for zero test (stall)
|
||
|
|
+
|
||
|
|
+ cmpgeb zero, t6, t7 # E :
|
||
|
|
+ beq a2, $u_eocfin # U :
|
||
|
|
+ ldi t6, -1 # E :
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+ bne t7, $u_final # U :
|
||
|
|
+ mask3b t6, a1, t6 # U : mask out bits already seen
|
||
|
|
+ stl_u t0, 0(a0) # L : store first output word
|
||
|
|
+ or t6, t2, t2 # E :
|
||
|
|
+
|
||
|
|
+ cmpgeb zero, t2, t7 # E : find nulls in second partial
|
||
|
|
+ addl a0, 8, a0 # E :
|
||
|
|
+ subl a2, 1, a2 # E :
|
||
|
|
+ bne t7, $u_late_head_exit # U :
|
||
|
|
+
|
||
|
|
+ /* Finally, we've got all the stupid leading edge cases taken care
|
||
|
|
+ of and we can set up to enter the main loop. */
|
||
|
|
+ ext3b t2, a1, t1 # U : position hi-bits of lo word
|
||
|
|
+ beq a2, $u_eoc # U :
|
||
|
|
+ ldl_u t2, 8(a1) # L : read next high-order source word
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+
|
||
|
|
+ ext7b t2, a1, t0 # U : position lo-bits of hi word (stall)
|
||
|
|
+ cmpgeb zero, t2, t7 # E :
|
||
|
|
+ nop
|
||
|
|
+ bne t7, $u_eos # U :
|
||
|
|
+
|
||
|
|
+ /* Unaligned copy main loop. In order to avoid reading too much,
|
||
|
|
+ the loop is structured to detect zeros in aligned source words.
|
||
|
|
+ This has, unfortunately, effectively pulled half of a loop
|
||
|
|
+ iteration out into the head and half into the tail, but it does
|
||
|
|
+ prevent nastiness from accumulating in the very thing we want
|
||
|
|
+ to run as fast as possible.
|
||
|
|
+
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t0 == the shifted low-order bits from the current source word
|
||
|
|
+ t1 == the shifted high-order bits from the previous source word
|
||
|
|
+ t2 == the unshifted current source word
|
||
|
|
+
|
||
|
|
+ We further know that t2 does not contain a null terminator. */
|
||
|
|
+
|
||
|
|
+ .align 4
|
||
|
|
+$u_loop:
|
||
|
|
+ or t0, t1, t0 # E : current dst word now complete
|
||
|
|
+ subl a2, 1, a2 # E : decrement word count
|
||
|
|
+ ext3b t2, a1, t1 # U : extract high bits for next time
|
||
|
|
+ addl a0, 8, a0 # E :
|
||
|
|
+
|
||
|
|
+ stl_u t0, -8(a0) # L : save the current word
|
||
|
|
+ beq a2, $u_eoc # U :
|
||
|
|
+ ldl_u t2, 8(a1) # L : Latency=3 load high word for next time
|
||
|
|
+ addl a1, 8, a1 # E :
|
||
|
|
+
|
||
|
|
+ ext7b t2, a1, t0 # U : extract low bits (2 cycle stall)
|
||
|
|
+ cmpgeb zero, t2, t7 # E : test new word for eos
|
||
|
|
+ nop
|
||
|
|
+ beq t7, $u_loop # U :
|
||
|
|
+
|
||
|
|
+ /* We've found a zero somewhere in the source word we just read.
|
||
|
|
+ If it resides in the lower half, we have one (probably partial)
|
||
|
|
+ word to write out, and if it resides in the upper half, we
|
||
|
|
+ have one full and one partial word left to write out.
|
||
|
|
+
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t0 == the shifted low-order bits from the current source word
|
||
|
|
+ t1 == the shifted high-order bits from the previous source word
|
||
|
|
+ t2 == the unshifted current source word. */
|
||
|
|
+$u_eos:
|
||
|
|
+ or t0, t1, t0 # E : first (partial) source word complete
|
||
|
|
+ nop
|
||
|
|
+ cmpgeb zero, t0, t7 # E : is the null in this first bit? (stall)
|
||
|
|
+ bne t7, $u_final # U : (stall)
|
||
|
|
+
|
||
|
|
+ stl_u t0, 0(a0) # L : the null was in the high-order bits
|
||
|
|
+ addl a0, 8, a0 # E :
|
||
|
|
+ subl a2, 1, a2 # E :
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+$u_late_head_exit:
|
||
|
|
+ ext3b t2, a1, t0 # U :
|
||
|
|
+ cmpgeb zero, t0, t7 # E :
|
||
|
|
+ or t7, t10, t6 # E : (stall)
|
||
|
|
+ seleq a2, t6, t7, t7 # E : Latency=2, extra map slot (stall)
|
||
|
|
+
|
||
|
|
+ /* Take care of a final (probably partial) result word.
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t0 == assembled source word
|
||
|
|
+ t7 == cmpgeb mask that found the null. */
|
||
|
|
+$u_final:
|
||
|
|
+ negl t7, t6 # E : isolate low bit set
|
||
|
|
+ and t6, t7, t8 # E : (stall)
|
||
|
|
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
|
||
|
|
+ bne t6, 1f # U : (stall)
|
||
|
|
+
|
||
|
|
+ ldl_u t1, 0(a0) # L :
|
||
|
|
+ subl t8, 1, t6 # E :
|
||
|
|
+ or t6, t8, t7 # E : (stall)
|
||
|
|
+ zapnot t0, t7, t0 # U : kill source bytes > null
|
||
|
|
+
|
||
|
|
+ zap t1, t7, t1 # U : kill dest bytes <= null
|
||
|
|
+ or t0, t1, t0 # E : (stall)
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+
|
||
|
|
+1: stl_u t0, 0(a0) # L :
|
||
|
|
+ ret (t9) # L0 : Latency=3
|
||
|
|
+
|
||
|
|
+ /* Got to end-of-count before end of string.
|
||
|
|
+ On entry to this basic block:
|
||
|
|
+ t1 == the shifted high-order bits from the previous source word */
|
||
|
|
+$u_eoc:
|
||
|
|
+ and a1, 7, t6 # E :
|
||
|
|
+ sll t10, t6, t6 # U : (stall)
|
||
|
|
+ and t6, 0xff, t6 # E : (stall)
|
||
|
|
+ bne t6, 1f # U : (stall)
|
||
|
|
+
|
||
|
|
+ ldl_u t2, 8(a1) # L : load final src word
|
||
|
|
+ nop
|
||
|
|
+ ext7b t2, a1, t0 # U : extract low bits for last word (stall)
|
||
|
|
+ or t1, t0, t1 # E : (stall)
|
||
|
|
+
|
||
|
|
+1: cmpgeb zero, t1, t7 # E :
|
||
|
|
+ mov t1, t0
|
||
|
|
+
|
||
|
|
+$u_eocfin: # end-of-count, final word
|
||
|
|
+ or t10, t7, t7 # E :
|
||
|
|
+ br $u_final # L0 : Latency=3
|
||
|
|
+
|
||
|
|
+ /* Unaligned copy entry point. */
|
||
|
|
+ .align 4
|
||
|
|
+$unaligned:
|
||
|
|
+
|
||
|
|
+ ldl_u t1, 0(a1) # L : load first source word
|
||
|
|
+ and a0, 7, t4 # E : find dest misalignment
|
||
|
|
+ and a1, 7, t5 # E : find src misalignment
|
||
|
|
+ /* Conditionally load the first destination word and a bytemask
|
||
|
|
+ with 0xff indicating that the destination byte is sacrosanct. */
|
||
|
|
+ mov zero, t0 # E :
|
||
|
|
+
|
||
|
|
+ mov zero, t6 # E :
|
||
|
|
+ beq t4, 1f # U :
|
||
|
|
+ ldl_u t0, 0(a0) # L :
|
||
|
|
+ ldi t6, -1 # E :
|
||
|
|
+
|
||
|
|
+ mask3b t6, a0, t6 # U :
|
||
|
|
+ nop
|
||
|
|
+ nop
|
||
|
|
+1: subl a1, t4, a1 # E : sub dest misalignment from src addr
|
||
|
|
+
|
||
|
|
+ /* If source misalignment is larger than dest misalignment, we need
|
||
|
|
+ extra startup checks to avoid SEGV. */
|
||
|
|
+
|
||
|
|
+ cmplt t4, t5, t8 # E :
|
||
|
|
+ ext3b t1, a1, t1 # U : shift src into place
|
||
|
|
+ ldi t2, -1 # E : for creating masks later
|
||
|
|
+ beq t8, $u_head # U : (stall)
|
||
|
|
+
|
||
|
|
+ mask7b t2, t5, t2 # U : begin src byte validity mask
|
||
|
|
+ cmpgeb zero, t1, t7 # E : is there a zero?
|
||
|
|
+ ext3b t2, a1, t2 # U :
|
||
|
|
+ or t7, t10, t5 # E : test for end-of-count too
|
||
|
|
+
|
||
|
|
+ cmpgeb zero, t2, t3 # E :
|
||
|
|
+ seleq a2, t5, t7, t7 # E : Latency=2, extra map slot
|
||
|
|
+ nop # E : keep with seleq
|
||
|
|
+ andnot t7, t3, t7 # E : (stall)
|
||
|
|
+
|
||
|
|
+ beq t7, $u_head # U :
|
||
|
|
+ /* At this point we've found a zero in the first partial word of
|
||
|
|
+ the source. We need to isolate the valid source data and mask
|
||
|
|
+ it into the original destination data. (Incidentally, we know
|
||
|
|
+ that we'll need at least one byte of that original dest word.) */
|
||
|
|
+ ldl_u t0, 0(a0) # L :
|
||
|
|
+ negl t7, t6 # E : build bitmask of bytes <= zero
|
||
|
|
+ mask7b t1, t4, t1 # U :
|
||
|
|
+
|
||
|
|
+ and t6, t7, t8 # E :
|
||
|
|
+ subl t8, 1, t6 # E : (stall)
|
||
|
|
+ or t6, t8, t7 # E : (stall)
|
||
|
|
+ zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
|
||
|
|
+
|
||
|
|
+ zapnot t1, t7, t1 # U : to source validity mask
|
||
|
|
+ andnot t0, t2, t0 # E : zero place for source to reside
|
||
|
|
+ or t0, t1, t0 # E : and put it there (stall both t0, t1)
|
||
|
|
+ stl_u t0, 0(a0) # L : (stall)
|
||
|
|
+
|
||
|
|
+ ret (t9) # L0 : Latency=3
|
||
|
|
+
|
||
|
|
+ cfi_endproc
|
||
|
|
diff --git a/sysdeps/sw_64/sw6b/sub_n.S b/sysdeps/sw_64/sw6b/sub_n.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..95c257f7
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw6b/sub_n.S
|
||
|
|
@@ -0,0 +1,147 @@
|
||
|
|
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
|
||
|
|
+ # store difference in a third limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # s2_ptr $18
|
||
|
|
+ # size $19
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_sub_n
|
||
|
|
+ .ent __mpn_sub_n
|
||
|
|
+__mpn_sub_n:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ or $31,$31,$25 # clear cy
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
|
||
|
|
+ # Start software pipeline for 1st loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ subl $4,$0,$20 # 1st main sub
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last sub
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ subl $5,$28,$21 # 2nd main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
|
||
|
|
+ # 1st loop handles groups of 4 limbs in a software pipeline
|
||
|
|
+ .align 4
|
||
|
|
+.Loop: cmpult $5,$21,$25 # compute cy from last add
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $6,$28,$22 # 3rd main sub
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $6,$22,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ subl $7,$28,$23 # 4th main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $7,$23,$25 # compute cy from last add
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ subl $4,$28,$20 # 1st main sub
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+ subl $5,$28,$21 # 2nd main sub
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ bge $19,.Loop
|
||
|
|
+ # Finish software pipeline for 1st loop
|
||
|
|
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ subl $6,$28,$22 # 3rd main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $6,$22,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ subl $7,$28,$23 # 4th main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $7,$23,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+.Lend2: addl $19,4,$19 # restore loop cnt
|
||
|
|
+ beq $19,.Lret
|
||
|
|
+ # Start software pipeline for 2nd loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $19,1,$19
|
||
|
|
+ beq $19,.Lend0
|
||
|
|
+ # 2nd loop handles remaining 1-3 limbs
|
||
|
|
+ .align 4
|
||
|
|
+.Loop0: addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $0,8($18)
|
||
|
|
+ subl $4,$28,$20 # main sub
|
||
|
|
+ ldl $1,8($17)
|
||
|
|
+ addl $18,8,$18
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ subl $19,1,$19 # decr loop cnt
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ or $1,$31,$4
|
||
|
|
+ bne $19,.Loop0
|
||
|
|
+.Lend0: addl $0,$25,$28 # cy add
|
||
|
|
+ subl $4,$28,$20 # main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+
|
||
|
|
+.Lret: or $25,$31,$0 # return cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_sub_n
|
||
|
|
diff --git a/sysdeps/sw_64/sw8a/add_n.S b/sysdeps/sw_64/sw8a/add_n.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..86e9f9ae
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw8a/add_n.S
|
||
|
|
@@ -0,0 +1,146 @@
|
||
|
|
+ # Sw_64 __mpn_add_n -- Add two limb vectors of the same length > 0 and
|
||
|
|
+ # store sum in a third limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # s2_ptr $18
|
||
|
|
+ # size $19
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_add_n
|
||
|
|
+ .ent __mpn_add_n
|
||
|
|
+__mpn_add_n:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ or $31,$31,$25 # clear cy
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
|
||
|
|
+ # Start software pipeline for 1st loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ addl $0,$4,$20 # 1st main add
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $20,$0,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ addl $5,$28,$21 # 2nd main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
|
||
|
|
+ # 1st loop handles groups of 4 limbs in a software pipeline
|
||
|
|
+ .align 4
|
||
|
|
+.Loop: cmpult $21,$28,$25 # compute cy from last add
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ addl $28,$6,$22 # 3rd main add
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $22,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ addl $28,$7,$23 # 4th main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $23,$28,$25 # compute cy from last add
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ addl $4,$28,$20 # 1st main add
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+ addl $5,$28,$21 # 2nd main add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ bge $19,.Loop
|
||
|
|
+ # Finish software pipeline for 1st loop
|
||
|
|
+.Lend1: cmpult $21,$28,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ addl $28,$6,$22 # 3rd main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $22,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ addl $28,$7,$23 # 4th main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $23,$28,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+.Lend2: addl $19,4,$19 # restore loop cnt
|
||
|
|
+ beq $19,.Lret
|
||
|
|
+ # Start software pipeline for 2nd loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $19,1,$19
|
||
|
|
+ beq $19,.Lend0
|
||
|
|
+ # 2nd loop handles remaining 1-3 limbs
|
||
|
|
+ .align 4
|
||
|
|
+.Loop0: addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $0,8($18)
|
||
|
|
+ addl $4,$28,$20 # main add
|
||
|
|
+ ldl $4,8($17)
|
||
|
|
+ addl $18,8,$18
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ subl $19,1,$19 # decr loop cnt
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ bne $19,.Loop0
|
||
|
|
+.Lend0: addl $0,$25,$28 # cy add
|
||
|
|
+ addl $4,$28,$20 # main add
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $20,$28,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+
|
||
|
|
+.Lret: or $25,$31,$0 # return cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_add_n
|
||
|
|
diff --git a/sysdeps/sw_64/sw8a/addmul_1.S b/sysdeps/sw_64/sw8a/addmul_1.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..95487c26
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw8a/addmul_1.S
|
||
|
|
@@ -0,0 +1,475 @@
|
||
|
|
+ # Sw_64 sw6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
|
||
|
|
+ # the result to a second limb vector.
|
||
|
|
+ #
|
||
|
|
+ # Copyright (C) 2000-2023 Free Software Foundation, Inc.
|
||
|
|
+ #
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+ #
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published
|
||
|
|
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
|
||
|
|
+ # your option) any later version.
|
||
|
|
+ #
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+ #
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # size $18
|
||
|
|
+ # s2_limb $19
|
||
|
|
+ #
|
||
|
|
+ #
|
||
|
|
+ # This code was written in close cooperation with pipeline expert
|
||
|
|
+ # . Any errors are tege's fault, though.
|
||
|
|
+ #
|
||
|
|
+ # Register usages for unrolled loop:
|
||
|
|
+ # 0-3 mul's
|
||
|
|
+ # 4-7 acc's
|
||
|
|
+ # 8-15 mul results
|
||
|
|
+ # 20,21 carry's
|
||
|
|
+ # 22,23 save for stores
|
||
|
|
+ #
|
||
|
|
+ # Sustains 8 mul-fadds in 29 cycles in the unrolled inner loop.
|
||
|
|
+ #
|
||
|
|
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
|
||
|
|
+ # them, so that further disturbance to the schedule is damped.
|
||
|
|
+ #
|
||
|
|
+ # We couldn't pair the loads, because the entangled schedule of the
|
||
|
|
+ # carry's has to happen on one side {0} of the machine. Note, the total
|
||
|
|
+ # use of U0, and the total use of L0 (after attending to the stores).
|
||
|
|
+ # which is part of the reason why....
|
||
|
|
+ #
|
||
|
|
+ # This is a great schedule for the d_cache, a poor schedule for the
|
||
|
|
+ # b_cache. The lockup on U0 means that any stall can't be recovered
|
||
|
|
+ # from. Consider a ldl in L1. say that load gets stalled because it
|
||
|
|
+ # collides with a fill from the b_Cache. On the next cycle, this load
|
||
|
|
+ # gets priority. If first looks at L0, and goes there. The instruction
|
||
|
|
+ # we intended for L0 gets to look at L1, which is NOT where we want
|
||
|
|
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
|
||
|
|
+ # causes a further instruction to stall.
|
||
|
|
+ #
|
||
|
|
+ # So for b_cache, we're likely going to want to put one or more cycles
|
||
|
|
+ # back into the code! And, of course, put in prefetches. For the
|
||
|
|
+ # accumulator, flds, intent to modify. For the fmuldiplier, you might
|
||
|
|
+ # want ldl, evict next, if you're not wanting to use it again soon. Use
|
||
|
|
+ # 256 ahead of present pointer value. At a place where we have an mt
|
||
|
|
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
|
||
|
|
+ # prefetch into lower.
|
||
|
|
+ #
|
||
|
|
+ # Note, the usage of physical registers per cycle is smoothed off, as
|
||
|
|
+ # much as possible.
|
||
|
|
+ #
|
||
|
|
+ # Note, the ldl's and stl's are at the end of the quadpacks. note, we'd
|
||
|
|
+ # like not to have a ldl or stl to preceded a conditional branch in a
|
||
|
|
+ # quadpack. The conditional branch moves the retire pointer one cycle
|
||
|
|
+ # later.
|
||
|
|
+ #
|
||
|
|
+ # Optimization notes:
|
||
|
|
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
|
||
|
|
+ # Reserved regs: $29 $30 $31
|
||
|
|
+ # Free caller-saves regs in unrolled code: $24 $25 $28
|
||
|
|
+ # We should swap some of the callee-saves regs for some of the free
|
||
|
|
+ # caller-saves regs, saving some overhead cycles.
|
||
|
|
+ # Most importantly, we should write fast code for the 0-7 case.
|
||
|
|
+ # The code we use there are for the 21164, and runs at 7 cycles/limb
|
||
|
|
+ # on the 21264. Should not be hard, if we write specialized code for
|
||
|
|
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
|
||
|
|
+ # need a jump table indexed by the low 3 bits of the count argument.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+ .text
|
||
|
|
+
|
||
|
|
+ .globl __mpn_addmul_1
|
||
|
|
+ .ent __mpn_addmul_1
|
||
|
|
+__mpn_addmul_1:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+ .prologue 0
|
||
|
|
+
|
||
|
|
+ cmpult $18, 8, $1
|
||
|
|
+ beq $1, $Large
|
||
|
|
+
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ umulh $2, $19, $0 # $0 = prod_high
|
||
|
|
+ beq $18, $Lend0b # jump if size was == 1
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $4
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ beq $18, $Lend0a # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+$Loop0: mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $18, 1, $18 # size--
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ bne $18, $Loop0
|
||
|
|
+$Lend0a:
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ addl $4, $0, $0 # cy_limb = prod_high + cy
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+$Lend0b:
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $0, $5, $0
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+
|
||
|
|
+$Large:
|
||
|
|
+ ldi $30, -240($30)
|
||
|
|
+ stl $9, 8($30)
|
||
|
|
+ stl $10, 16($30)
|
||
|
|
+ stl $11, 24($30)
|
||
|
|
+ stl $12, 32($30)
|
||
|
|
+ stl $13, 40($30)
|
||
|
|
+ stl $14, 48($30)
|
||
|
|
+ stl $15, 56($30)
|
||
|
|
+
|
||
|
|
+ and $18, 7, $20 # count for the first loop, 0-7
|
||
|
|
+ srl $18, 3, $18 # count for unrolled loop
|
||
|
|
+ bis $31, $31, $0
|
||
|
|
+ beq $20, $Lunroll
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ umulh $2, $19, $0 # $0 = prod_high
|
||
|
|
+ beq $20, $Lend1b # jump if size was == 1
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $4
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ beq $20, $Lend1a # jump if size was == 2
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+$Loop1: mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ subl $20, 1, $20 # size--
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ ldl $2, 0($17) # $2 = s1_limb
|
||
|
|
+ addl $17, 8, $17 # s1_ptr++
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ bne $20, $Loop1
|
||
|
|
+
|
||
|
|
+$Lend1a:
|
||
|
|
+ mull $2, $19, $3 # $3 = prod_low
|
||
|
|
+ ldl $5, 0($16) # $5 = *res_ptr
|
||
|
|
+ addl $4, $0, $0 # cy_limb = cy_limb + 'cy'
|
||
|
|
+ umulh $2, $19, $4 # $4 = cy_limb
|
||
|
|
+ addl $3, $0, $3 # $3 = cy_limb + prod_low
|
||
|
|
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $5, $0, $0 # combine carries
|
||
|
|
+ addl $4, $0, $0 # cy_limb = prod_high + cy
|
||
|
|
+ br $31, $Lunroll
|
||
|
|
+$Lend1b:
|
||
|
|
+ addl $5, $3, $3
|
||
|
|
+ cmpult $3, $5, $5
|
||
|
|
+ stl $3, 0($16)
|
||
|
|
+ addl $16, 8, $16 # res_ptr++
|
||
|
|
+ addl $0, $5, $0
|
||
|
|
+
|
||
|
|
+$Lunroll:
|
||
|
|
+ ldi $17, -16($17) # L1 bookkeeping
|
||
|
|
+ ldi $16, -16($16) # L1 bookkeeping
|
||
|
|
+ bis $0, $31, $12
|
||
|
|
+
|
||
|
|
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
|
||
|
|
+
|
||
|
|
+ ldl $2, 16($17) # L1
|
||
|
|
+ ldl $3, 24($17) # L1
|
||
|
|
+ ldi $18, -1($18) # L1 bookkeeping
|
||
|
|
+ ldl $6, 16($16) # L1
|
||
|
|
+ ldl $7, 24($16) # L1
|
||
|
|
+ ldl $0, 32($17) # L1
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ ldl $1, 40($17) # L1
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ ldi $17, 64($17) # L1 bookkeeping
|
||
|
|
+ ldl $4, 32($16) # L1
|
||
|
|
+ ldl $5, 40($16) # L1
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ ldl $2, -16($17) # L1
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ ldl $3, -8($17) # L1
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ ldi $16, 64($16) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, -16($16) # L1
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, -8($16) # L1
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 0($17) # L1
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 8($17) # L1
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # L0 lo + acc
|
||
|
|
+ stl $22, -48($16) # L0
|
||
|
|
+ stl $23, -40($16) # L1
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ addl $8, $21, $8 # U0 hi mul + carry
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ ble $18, $Lend # U1 bookkeeping
|
||
|
|
+
|
||
|
|
+ # ____ MAIN UNROLLED LOOP ____
|
||
|
|
+ .align 4
|
||
|
|
+$Loop:
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 0($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 8($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ ldl $2, 16($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ ldl $3, 24($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, -32($16) # L0
|
||
|
|
+ stl $23, -24($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $18, -1($18) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, 16($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, 24($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 32($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 40($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # U0 lo + acc
|
||
|
|
+ stl $22, -16($16) # L0
|
||
|
|
+ stl $23, -8($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $8, $21, $8 # L0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $17, 64($17) # L1 bookkeeping
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 32($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 40($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ ldl $2, -16($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ ldl $3, -8($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, 0($16) # L0
|
||
|
|
+ stl $23, 8($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ ldi $16, 64($16) # L1 bookkeeping
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $6, -16($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # U1 mt
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ ldl $7, -8($16) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ ldl $0, 0($17) # L1
|
||
|
|
+
|
||
|
|
+ mull $19, $2, $13 # U1
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ ldl $1, 8($17) # L1
|
||
|
|
+
|
||
|
|
+ umulh $19, $2, $14 # U1
|
||
|
|
+ addl $4, $9, $4 # L0 lo + acc
|
||
|
|
+ stl $22, -48($16) # L0
|
||
|
|
+ stl $23, -40($16) # L1
|
||
|
|
+
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ mull $19, $3, $15 # U1
|
||
|
|
+ bis $31, $31, $31 # L1 st slosh
|
||
|
|
+ addl $8, $21, $8 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ bis $31, $31, $31 # L1 mt
|
||
|
|
+ bgt $18, $Loop # U1 bookkeeping
|
||
|
|
+
|
||
|
|
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
|
||
|
|
+$Lend:
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ ldl $4, 0($16) # L1
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ ldl $5, 8($16) # L1
|
||
|
|
+ umulh $19, $3, $8 # U1
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ mull $19, $0, $9 # U1
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ umulh $19, $0, $10 # U1
|
||
|
|
+ addl $6, $13, $6 # L0 lo + acc
|
||
|
|
+ stl $22, -32($16) # L0
|
||
|
|
+ stl $23, -24($16) # L1
|
||
|
|
+ mull $19, $1, $11 # U1
|
||
|
|
+ addl $12, $21, $12 # U0 hi mul + carry
|
||
|
|
+ cmpult $6, $13, $20 # L0 lo add => carry
|
||
|
|
+ addl $6, $12, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $12, $21 # L0 hi add => carry
|
||
|
|
+ addl $14, $20, $14 # U0 hi mul + carry
|
||
|
|
+ addl $7, $15, $23 # L0 lo + acc
|
||
|
|
+ addl $14, $21, $14 # U0 hi mul + carry
|
||
|
|
+ umulh $19, $1, $12 # U1
|
||
|
|
+ cmpult $23, $15, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $14, $23 # U0 hi add => answer
|
||
|
|
+ cmpult $23, $14, $21 # L0 hi add => carry
|
||
|
|
+ addl $8, $20, $8 # U0 hi mul + carry
|
||
|
|
+ addl $4, $9, $4 # U0 lo + acc
|
||
|
|
+ stl $22, -16($16) # L0
|
||
|
|
+ stl $23, -8($16) # L1
|
||
|
|
+ bis $31, $31, $31 # L0 st slosh
|
||
|
|
+ addl $8, $21, $8 # L0 hi mul + carry
|
||
|
|
+ cmpult $4, $9, $20 # L0 lo add => carry
|
||
|
|
+ addl $4, $8, $22 # U0 hi add => answer
|
||
|
|
+ cmpult $22, $8, $21 # L0 hi add => carry
|
||
|
|
+ addl $10, $20, $10 # U0 hi mul + carry
|
||
|
|
+ addl $5, $11, $23 # L0 lo + acc
|
||
|
|
+ addl $10, $21, $10 # L0 hi mul + carry
|
||
|
|
+ cmpult $23, $11, $20 # L0 lo add => carry
|
||
|
|
+ addl $23, $10, $23 # U0 hi add => answer
|
||
|
|
+ cmpult $23, $10, $21 # L0 hi add => carry
|
||
|
|
+ addl $12, $20, $12 # U0 hi mul + carry
|
||
|
|
+ stl $22, 0($16) # L0
|
||
|
|
+ stl $23, 8($16) # L1
|
||
|
|
+ addl $12, $21, $0 # U0 hi mul + carry
|
||
|
|
+
|
||
|
|
+ ldl $9, 8($30)
|
||
|
|
+ ldl $10, 16($30)
|
||
|
|
+ ldl $11, 24($30)
|
||
|
|
+ ldl $12, 32($30)
|
||
|
|
+ ldl $13, 40($30)
|
||
|
|
+ ldl $14, 48($30)
|
||
|
|
+ ldl $15, 56($30)
|
||
|
|
+ ldi $30, 240($30)
|
||
|
|
+ ret $31, ($26), 1
|
||
|
|
+
|
||
|
|
+ .end __mpn_addmul_1
|
||
|
|
diff --git a/sysdeps/sw_64/sw8a/lshift.S b/sysdeps/sw_64/sw8a/lshift.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..76f1fb0e
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw8a/lshift.S
|
||
|
|
@@ -0,0 +1,172 @@
|
||
|
|
+ # Sw_64 __mpn_lshift --
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # cnt r19
|
||
|
|
+
|
||
|
|
+ # This code runs at 3.25 cycles/limb on the sw_64.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_lshift
|
||
|
|
+ .ent __mpn_lshift
|
||
|
|
+__mpn_lshift:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ s8addl $18,$17,$17 # make r17 point at end of s1
|
||
|
|
+ ldl $4,-8($17) # load first limb
|
||
|
|
+ subl $31,$19,$20
|
||
|
|
+ s8addl $18,$16,$16 # make r16 point at end of RES
|
||
|
|
+ subl $18,1,$18
|
||
|
|
+ and $18,4-1,$28 # number of limbs in first loop
|
||
|
|
+ srl $4,$20,$0 # compute function result
|
||
|
|
+
|
||
|
|
+ beq $28,.L0
|
||
|
|
+ subl $18,$28,$18
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop0: ldl $3,-16($17)
|
||
|
|
+ subl $16,8,$16
|
||
|
|
+ sll $4,$19,$5
|
||
|
|
+ subl $17,8,$17
|
||
|
|
+ subl $28,1,$28
|
||
|
|
+ srl $3,$20,$6
|
||
|
|
+ or $3,$3,$4
|
||
|
|
+ or $5,$6,$8
|
||
|
|
+ stl $8,0($16)
|
||
|
|
+ bne $28,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: sll $4,$19,$24
|
||
|
|
+ beq $18,.Lend
|
||
|
|
+ # warm up phase 1
|
||
|
|
+ ldl $1,-16($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ ldl $2,-24($17)
|
||
|
|
+ ldl $3,-32($17)
|
||
|
|
+ ldl $4,-40($17)
|
||
|
|
+ beq $18,.Lend1
|
||
|
|
+ # warm up phase 2
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ ldl $1,-48($17)
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ ldl $2,-56($17)
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ ldl $3,-64($17)
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ ldl $4,-72($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ beq $18,.Lend2
|
||
|
|
+ .align 4
|
||
|
|
+ # main loop
|
||
|
|
+.Loop: stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ ldl $1,-80($17)
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ ldl $2,-88($17)
|
||
|
|
+
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ subl $16,32,$16
|
||
|
|
+
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ ldl $3,-96($17)
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ ldl $4,-104($17)
|
||
|
|
+
|
||
|
|
+ subl $17,32,$17
|
||
|
|
+ bne $18,.Loop
|
||
|
|
+ # cool down phase 2/1
|
||
|
|
+.Lend2: stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ # cool down phase 2/2
|
||
|
|
+ stl $7,-40($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-48($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,-56($16)
|
||
|
|
+ stl $6,-64($16)
|
||
|
|
+ # cool down phase 2/3
|
||
|
|
+ stl $24,-72($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ # cool down phase 1/1
|
||
|
|
+.Lend1: srl $1,$20,$7
|
||
|
|
+ sll $1,$19,$21
|
||
|
|
+ srl $2,$20,$8
|
||
|
|
+ sll $2,$19,$22
|
||
|
|
+ srl $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ sll $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ srl $4,$20,$6
|
||
|
|
+ sll $4,$19,$24
|
||
|
|
+ # cool down phase 1/2
|
||
|
|
+ stl $7,-8($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,-16($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,-24($16)
|
||
|
|
+ stl $6,-32($16)
|
||
|
|
+ stl $24,-40($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+.Lend: stl $24,-8($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_lshift
|
||
|
|
diff --git a/sysdeps/sw_64/sw8a/rshift.S b/sysdeps/sw_64/sw8a/rshift.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..ec2a78b0
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw8a/rshift.S
|
||
|
|
@@ -0,0 +1,170 @@
|
||
|
|
+ # Sw_64 __mpn_rshift --
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1994-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr r16
|
||
|
|
+ # s1_ptr r17
|
||
|
|
+ # size r18
|
||
|
|
+ # cnt r19
|
||
|
|
+
|
||
|
|
+ # This code runs at 3.25 cycles/limb on the sw_64.
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_rshift
|
||
|
|
+ .ent __mpn_rshift
|
||
|
|
+__mpn_rshift:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ ldl $4,0($17) # load first limb
|
||
|
|
+ subl $31,$19,$20
|
||
|
|
+ subl $18,1,$18
|
||
|
|
+ and $18,4-1,$28 # number of limbs in first loop
|
||
|
|
+ sll $4,$20,$0 # compute function result
|
||
|
|
+
|
||
|
|
+ beq $28,.L0
|
||
|
|
+ subl $18,$28,$18
|
||
|
|
+
|
||
|
|
+ .align 3
|
||
|
|
+.Loop0: ldl $3,8($17)
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ srl $4,$19,$5
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ subl $28,1,$28
|
||
|
|
+ sll $3,$20,$6
|
||
|
|
+ or $3,$3,$4
|
||
|
|
+ or $5,$6,$8
|
||
|
|
+ stl $8,-8($16)
|
||
|
|
+ bne $28,.Loop0
|
||
|
|
+
|
||
|
|
+.L0: srl $4,$19,$24
|
||
|
|
+ beq $18,.Lend
|
||
|
|
+ # warm up phase 1
|
||
|
|
+ ldl $1,8($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ ldl $2,16($17)
|
||
|
|
+ ldl $3,24($17)
|
||
|
|
+ ldl $4,32($17)
|
||
|
|
+ beq $18,.Lend1
|
||
|
|
+ # warm up phase 2
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ ldl $1,40($17)
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ ldl $2,48($17)
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ ldl $3,56($17)
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ ldl $4,64($17)
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ beq $18,.Lend2
|
||
|
|
+ .align 4
|
||
|
|
+ # main loop
|
||
|
|
+.Loop: stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ subl $18,4,$18
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ ldl $1,72($17)
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ ldl $2,80($17)
|
||
|
|
+
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ unop # ldl $31,-96($17)
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ addl $16,32,$16
|
||
|
|
+
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ ldl $3,88($17)
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ ldl $4,96($17)
|
||
|
|
+
|
||
|
|
+ addl $17,32,$17
|
||
|
|
+ bne $18,.Loop
|
||
|
|
+ # cool down phase 2/1
|
||
|
|
+.Lend2: stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ # cool down phase 2/2
|
||
|
|
+ stl $7,32($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,40($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,48($16)
|
||
|
|
+ stl $6,56($16)
|
||
|
|
+ # cool down phase 2/3
|
||
|
|
+ stl $24,64($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ # cool down phase 1/1
|
||
|
|
+.Lend1: sll $1,$20,$7
|
||
|
|
+ srl $1,$19,$21
|
||
|
|
+ sll $2,$20,$8
|
||
|
|
+ srl $2,$19,$22
|
||
|
|
+ sll $3,$20,$5
|
||
|
|
+ or $7,$24,$7
|
||
|
|
+ srl $3,$19,$23
|
||
|
|
+ or $8,$21,$8
|
||
|
|
+ sll $4,$20,$6
|
||
|
|
+ srl $4,$19,$24
|
||
|
|
+ # cool down phase 1/2
|
||
|
|
+ stl $7,0($16)
|
||
|
|
+ or $5,$22,$5
|
||
|
|
+ stl $8,8($16)
|
||
|
|
+ or $6,$23,$6
|
||
|
|
+ stl $5,16($16)
|
||
|
|
+ stl $6,24($16)
|
||
|
|
+ stl $24,32($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+.Lend: stl $24,0($16)
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_rshift
|
||
|
|
diff --git a/sysdeps/sw_64/sw8a/sub_n.S b/sysdeps/sw_64/sw8a/sub_n.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..95c257f7
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/sw8a/sub_n.S
|
||
|
|
@@ -0,0 +1,147 @@
|
||
|
|
+ # Sw_64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
|
||
|
|
+ # store difference in a third limb vector.
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1995-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+
|
||
|
|
+ # INPUT PARAMETERS
|
||
|
|
+ # res_ptr $16
|
||
|
|
+ # s1_ptr $17
|
||
|
|
+ # s2_ptr $18
|
||
|
|
+ # size $19
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+.text
|
||
|
|
+ .align 3
|
||
|
|
+ .globl __mpn_sub_n
|
||
|
|
+ .ent __mpn_sub_n
|
||
|
|
+__mpn_sub_n:
|
||
|
|
+ .frame $30,0,$26,0
|
||
|
|
+
|
||
|
|
+ or $31,$31,$25 # clear cy
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ blt $19,.Lend2 # if less than 4 limbs, goto 2nd loop
|
||
|
|
+ # Start software pipeline for 1st loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ subl $4,$0,$20 # 1st main sub
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last sub
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ subl $5,$28,$21 # 2nd main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ blt $19,.Lend1 # if less than 4 limbs remain, jump
|
||
|
|
+ # 1st loop handles groups of 4 limbs in a software pipeline
|
||
|
|
+ .align 4
|
||
|
|
+.Loop: cmpult $5,$21,$25 # compute cy from last add
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ ldl $1,8($18)
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $6,$28,$22 # 3rd main sub
|
||
|
|
+ ldl $5,8($17)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $6,$22,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ subl $7,$28,$23 # 4th main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $7,$23,$25 # compute cy from last add
|
||
|
|
+ addl $17,32,$17 # update s1_ptr
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $2,16($18)
|
||
|
|
+ subl $4,$28,$20 # 1st main sub
|
||
|
|
+ ldl $3,24($18)
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ ldl $6,-16($17)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ ldl $7,-8($17)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ subl $19,4,$19 # decr loop cnt
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ addl $1,$25,$28 # cy add
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+ subl $5,$28,$21 # 2nd main sub
|
||
|
|
+ addl $18,32,$18 # update s2_ptr
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ bge $19,.Loop
|
||
|
|
+ # Finish software pipeline for 1st loop
|
||
|
|
+.Lend1: cmpult $5,$21,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $2,$25,$28 # cy add
|
||
|
|
+ subl $6,$28,$22 # 3rd main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $6,$22,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ stl $21,8($16)
|
||
|
|
+ addl $3,$25,$28 # cy add
|
||
|
|
+ subl $7,$28,$23 # 4th main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $7,$23,$25 # compute cy from last add
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,32,$16 # update res_ptr
|
||
|
|
+ stl $22,-16($16)
|
||
|
|
+ stl $23,-8($16)
|
||
|
|
+.Lend2: addl $19,4,$19 # restore loop cnt
|
||
|
|
+ beq $19,.Lret
|
||
|
|
+ # Start software pipeline for 2nd loop
|
||
|
|
+ ldl $0,0($18)
|
||
|
|
+ ldl $4,0($17)
|
||
|
|
+ subl $19,1,$19
|
||
|
|
+ beq $19,.Lend0
|
||
|
|
+ # 2nd loop handles remaining 1-3 limbs
|
||
|
|
+ .align 4
|
||
|
|
+.Loop0: addl $0,$25,$28 # cy add
|
||
|
|
+ ldl $0,8($18)
|
||
|
|
+ subl $4,$28,$20 # main sub
|
||
|
|
+ ldl $1,8($17)
|
||
|
|
+ addl $18,8,$18
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ addl $17,8,$17
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ subl $19,1,$19 # decr loop cnt
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+ addl $16,8,$16
|
||
|
|
+ or $1,$31,$4
|
||
|
|
+ bne $19,.Loop0
|
||
|
|
+.Lend0: addl $0,$25,$28 # cy add
|
||
|
|
+ subl $4,$28,$20 # main sub
|
||
|
|
+ cmpult $28,$25,$8 # compute cy from last add
|
||
|
|
+ cmpult $4,$20,$25 # compute cy from last add
|
||
|
|
+ stl $20,0($16)
|
||
|
|
+ or $8,$25,$25 # combine cy from the two fadds
|
||
|
|
+
|
||
|
|
+.Lret: or $25,$31,$0 # return cy
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+ .end __mpn_sub_n
|
||
|
|
diff --git a/sysdeps/sw_64/udiv_qrnnd.S b/sysdeps/sw_64/udiv_qrnnd.S
|
||
|
|
new file mode 100644
|
||
|
|
index 00000000..054034cd
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/sysdeps/sw_64/udiv_qrnnd.S
|
||
|
|
@@ -0,0 +1,159 @@
|
||
|
|
+ # Sw_64 1621 __udiv_qrnnd
|
||
|
|
+
|
||
|
|
+ # Copyright (C) 1992-2023 Free Software Foundation, Inc.
|
||
|
|
+
|
||
|
|
+ # This file is part of the GNU MP Library.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
|
+ # it under the terms of the GNU Lesser General Public License as published by
|
||
|
|
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
|
||
|
|
+ # option) any later version.
|
||
|
|
+
|
||
|
|
+ # The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
|
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
|
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||
|
|
+ # License for more details.
|
||
|
|
+
|
||
|
|
+ # You should have received a copy of the GNU Lesser General Public License
|
||
|
|
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
+
|
||
|
|
+#include <sysdep.h>
|
||
|
|
+
|
||
|
|
+ .set noreorder
|
||
|
|
+ .set noat
|
||
|
|
+
|
||
|
|
+ .text
|
||
|
|
+
|
||
|
|
+LEAF(__udiv_qrnnd, 0)
|
||
|
|
+#ifdef PROF
|
||
|
|
+ ldgp gp, 0(pv)
|
||
|
|
+ ldi AT, _mcount
|
||
|
|
+ call AT, (AT), _mcount
|
||
|
|
+ .prologue 1
|
||
|
|
+#else
|
||
|
|
+ .prologue 0
|
||
|
|
+#endif
|
||
|
|
+
|
||
|
|
+#define cnt $2
|
||
|
|
+#define tmp $3
|
||
|
|
+#define rem_ptr $16
|
||
|
|
+#define n1 $17
|
||
|
|
+#define n0 $18
|
||
|
|
+#define d $19
|
||
|
|
+#define qb $20
|
||
|
|
+
|
||
|
|
+ ldi cnt,16
|
||
|
|
+ blt d,$largedivisor
|
||
|
|
+
|
||
|
|
+$loop1: cmplt n0,0,tmp
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ bis n1,tmp,n1
|
||
|
|
+ addl n0,n0,n0
|
||
|
|
+ cmpule d,n1,qb
|
||
|
|
+ subl n1,d,tmp
|
||
|
|
+ selne qb,tmp,n1,n1
|
||
|
|
+ bis n0,qb,n0
|
||
|
|
+ cmplt n0,0,tmp
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ bis n1,tmp,n1
|
||
|
|
+ addl n0,n0,n0
|
||
|
|
+ cmpule d,n1,qb
|
||
|
|
+ subl n1,d,tmp
|
||
|
|
+ selne qb,tmp,n1,n1
|
||
|
|
+ bis n0,qb,n0
|
||
|
|
+ cmplt n0,0,tmp
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ bis n1,tmp,n1
|
||
|
|
+ addl n0,n0,n0
|
||
|
|
+ cmpule d,n1,qb
|
||
|
|
+ subl n1,d,tmp
|
||
|
|
+ selne qb,tmp,n1,n1
|
||
|
|
+ bis n0,qb,n0
|
||
|
|
+ cmplt n0,0,tmp
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ bis n1,tmp,n1
|
||
|
|
+ addl n0,n0,n0
|
||
|
|
+ cmpule d,n1,qb
|
||
|
|
+ subl n1,d,tmp
|
||
|
|
+ selne qb,tmp,n1,n1
|
||
|
|
+ bis n0,qb,n0
|
||
|
|
+ subl cnt,1,cnt
|
||
|
|
+ bgt cnt,$loop1
|
||
|
|
+ stl n1,0(rem_ptr)
|
||
|
|
+ bis $31,n0,$0
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+$largedivisor:
|
||
|
|
+ and n0,1,$4
|
||
|
|
+
|
||
|
|
+ srl n0,1,n0
|
||
|
|
+ sll n1,63,tmp
|
||
|
|
+ or tmp,n0,n0
|
||
|
|
+ srl n1,1,n1
|
||
|
|
+
|
||
|
|
+ and d,1,$6
|
||
|
|
+ srl d,1,$5
|
||
|
|
+ addl $5,$6,$5
|
||
|
|
+
|
||
|
|
+$loop2: cmplt n0,0,tmp
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ bis n1,tmp,n1
|
||
|
|
+ addl n0,n0,n0
|
||
|
|
+ cmpule $5,n1,qb
|
||
|
|
+ subl n1,$5,tmp
|
||
|
|
+ selne qb,tmp,n1,n1
|
||
|
|
+ bis n0,qb,n0
|
||
|
|
+ cmplt n0,0,tmp
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ bis n1,tmp,n1
|
||
|
|
+ addl n0,n0,n0
|
||
|
|
+ cmpule $5,n1,qb
|
||
|
|
+ subl n1,$5,tmp
|
||
|
|
+ selne qb,tmp,n1,n1
|
||
|
|
+ bis n0,qb,n0
|
||
|
|
+ cmplt n0,0,tmp
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ bis n1,tmp,n1
|
||
|
|
+ addl n0,n0,n0
|
||
|
|
+ cmpule $5,n1,qb
|
||
|
|
+ subl n1,$5,tmp
|
||
|
|
+ selne qb,tmp,n1,n1
|
||
|
|
+ bis n0,qb,n0
|
||
|
|
+ cmplt n0,0,tmp
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ bis n1,tmp,n1
|
||
|
|
+ addl n0,n0,n0
|
||
|
|
+ cmpule $5,n1,qb
|
||
|
|
+ subl n1,$5,tmp
|
||
|
|
+ selne qb,tmp,n1,n1
|
||
|
|
+ bis n0,qb,n0
|
||
|
|
+ subl cnt,1,cnt
|
||
|
|
+ bgt cnt,$loop2
|
||
|
|
+
|
||
|
|
+ addl n1,n1,n1
|
||
|
|
+ addl $4,n1,n1
|
||
|
|
+ bne $6,$Odd
|
||
|
|
+ stl n1,0(rem_ptr)
|
||
|
|
+ bis $31,n0,$0
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+$Odd:
|
||
|
|
+ /* q' in n0. r' in n1 */
|
||
|
|
+ addl n1,n0,n1
|
||
|
|
+
|
||
|
|
+ cmpult n1,n0,tmp # tmp := carry from addl
|
||
|
|
+ subl n1,d,AT
|
||
|
|
+ addl n0,tmp,n0
|
||
|
|
+ selne tmp,AT,n1,n1
|
||
|
|
+
|
||
|
|
+ cmpult n1,d,tmp
|
||
|
|
+ addl n0,1,AT
|
||
|
|
+ seleq tmp,AT,n0,n0
|
||
|
|
+ subl n1,d,AT
|
||
|
|
+ seleq tmp,AT,n1,n1
|
||
|
|
+
|
||
|
|
+ stl n1,0(rem_ptr)
|
||
|
|
+ bis $31,n0,$0
|
||
|
|
+ ret $31,($26),1
|
||
|
|
+
|
||
|
|
+ .end __udiv_qrnnd
|
||
|
|
--
|
||
|
|
2.25.1
|
||
|
|
|