openssl/add-sw_64-support.patch
Commit 0fae59f9c7 ("add sw_64 support") by Hailiang, 2025-04-18 15:39:42 +08:00
(cherry picked from commit 1409753b25c77862fefee90aa39a40dd258cffde)

From dfb837f21d55b9cf0bf78e6b3d5dc02567672400 Mon Sep 17 00:00:00 2001
From: mahailiang <mahailiang@uniontech.com>
Date: Sun, 29 Sep 2024 17:26:05 +0800
Subject: [PATCH] to support sw_64
---
Configurations/10-main.conf | 12 +
crypto/bn/asm/sw_64-mont.pl | 328 ++++++++++++++++++++++
crypto/bn/bn_local.h | 2 +-
crypto/bn/build.info | 2 +-
crypto/modes/asm/ghash-sw_64.pl | 467 ++++++++++++++++++++++++++++++++
crypto/sha/asm/sha1-sw_64.pl | 329 ++++++++++++++++++++++
crypto/sha/build.info | 2 +-
crypto/sw_64cpuid.pl | 273 +++++++++++++++++++
include/crypto/md32_common.h | 2 +-
9 files changed, 1413 insertions(+), 4 deletions(-)
create mode 100644 crypto/bn/asm/sw_64-mont.pl
create mode 100644 crypto/modes/asm/ghash-sw_64.pl
create mode 100644 crypto/sha/asm/sha1-sw_64.pl
create mode 100644 crypto/sw_64cpuid.pl
diff --git a/Configurations/10-main.conf b/Configurations/10-main.conf
index 915e7dd..33fd760 100644
--- a/Configurations/10-main.conf
+++ b/Configurations/10-main.conf
@@ -984,6 +984,18 @@ my %targets = (
asm_arch => 'alpha',
perlasm_scheme => "void",
},
+ "linux-sw_64-gcc" => {
+ inherit_from => [ "linux-generic64" ],
+ lib_cppflags => add("-DL_ENDIAN"),
+ bn_ops => "SIXTY_FOUR_BIT_LONG",
+ },
+ "linux-sw_64" => {
+ inherit_from => [ "linux-generic64" ],
+ cflags => add("-DL_ENDIAN"),
+ bn_ops => "SIXTY_FOUR_BIT_LONG",
+ perlasm_scheme => "elf",
+ multilib => "64",
+ },
"linux-c64xplus" => {
inherit_from => [ "BASE_unix" ],
# TI_CGT_C6000_7.3.x is a requirement
diff --git a/crypto/bn/asm/sw_64-mont.pl b/crypto/bn/asm/sw_64-mont.pl
new file mode 100644
index 0000000..348b903
--- /dev/null
+++ b/crypto/bn/asm/sw_64-mont.pl
@@ -0,0 +1,328 @@
+#! /usr/bin/env perl
+# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
+# instructed to '-tune host' code with in-line assembler. Other
+# benchmarks improve by 15-20%. To anchor it to something else, the
+# code provides approximately the same performance per GHz as AMD64.
+# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
+# difference.
+
+$output=pop;
+open STDOUT,">$output";
+
+# int bn_mul_mont(
+$rp="a0"; # BN_ULONG *rp,
+$ap="a1"; # const BN_ULONG *ap,
+$bp="a2"; # const BN_ULONG *bp,
+$np="a3"; # const BN_ULONG *np,
+$n0="a4"; # const BN_ULONG *n0,
+$num="a5"; # int num);
+
+$lo0="t0";
+$hi0="t1";
+$lo1="t2";
+$hi1="t3";
+$aj="t4";
+$bi="t5";
+$nj="t6";
+$tp="t7";
+$alo="t8";
+$ahi="t9";
+$nlo="t10";
+$nhi="t11";
+$tj="t12";
+$i="s3";
+$j="s4";
+$m1="s5";
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+
+.globl bn_mul_mont
+.align 5
+.ent bn_mul_mont
+bn_mul_mont:
+ ldi sp,-48(sp)
+ stl ra,0(sp)
+ stl s3,8(sp)
+ stl s4,16(sp)
+ stl s5,24(sp)
+ stl fp,32(sp)
+ mov sp,fp
+ .mask 0x0400f000,-48
+ .frame fp,48,ra
+ .prologue 0
+
+ .align 4
+ .set reorder
+ sextl $num,$num
+ mov 0,v0
+ cmplt $num,4,AT
+ bne AT,.Lexit
+
+ ldl $hi0,0($ap) # ap[0]
+ s8addl $num,16,AT
+ ldl $aj,8($ap)
+ subl sp,AT,sp
+ ldl $bi,0($bp) # bp[0]
+ ldi AT,-4096(zero) # mov -4096,AT
+ ldl $n0,0($n0)
+ and sp,AT,sp
+
+ mull $hi0,$bi,$lo0
+ ldl $hi1,0($np) # np[0]
+ umulh $hi0,$bi,$hi0
+ ldl $nj,8($np)
+
+ mull $lo0,$n0,$m1
+
+ mull $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addl $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ addl $hi1,AT,$hi1
+
+ mull $aj,$bi,$alo
+ mov 2,$j
+ umulh $aj,$bi,$ahi
+ mov sp,$tp
+
+ mull $nj,$m1,$nlo
+ s8addl $j,$ap,$aj
+ umulh $nj,$m1,$nhi
+ s8addl $j,$np,$nj
+.align 4
+.L1st:
+ .set noreorder
+ ldl $aj,0($aj)
+ addw $j,1,$j
+ ldl $nj,0($nj)
+ ldi $tp,8($tp)
+
+ addl $alo,$hi0,$lo0
+ mull $aj,$bi,$alo
+ cmpult $lo0,$hi0,AT
+ addl $nlo,$hi1,$lo1
+
+ mull $nj,$m1,$nlo
+ addl $ahi,AT,$hi0
+ cmpult $lo1,$hi1,v0
+ cmplt $j,$num,$tj
+
+ umulh $aj,$bi,$ahi
+ addl $nhi,v0,$hi1
+ addl $lo1,$lo0,$lo1
+ s8addl $j,$ap,$aj
+
+ umulh $nj,$m1,$nhi
+ cmpult $lo1,$lo0,v0
+ addl $hi1,v0,$hi1
+ s8addl $j,$np,$nj
+
+ stl $lo1,-8($tp)
+ nop
+ unop
+ bne $tj,.L1st
+ .set reorder
+
+ addl $alo,$hi0,$lo0
+ addl $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addl $ahi,AT,$hi0
+ addl $nhi,v0,$hi1
+
+ addl $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,v0
+ addl $hi1,v0,$hi1
+
+ stl $lo1,0($tp)
+
+ addl $hi1,$hi0,$hi1
+ cmpult $hi1,$hi0,AT
+ stl $hi1,8($tp)
+ stl AT,16($tp)
+
+ mov 1,$i
+.align 4
+.Louter:
+ s8addl $i,$bp,$bi
+ ldl $hi0,0($ap)
+ ldl $aj,8($ap)
+ ldl $bi,0($bi)
+ ldl $hi1,0($np)
+ ldl $nj,8($np)
+ ldl $tj,0(sp)
+
+ mull $hi0,$bi,$lo0
+ umulh $hi0,$bi,$hi0
+
+ addl $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addl $hi0,AT,$hi0
+
+ mull $lo0,$n0,$m1
+
+ mull $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addl $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ mov 2,$j
+ addl $hi1,AT,$hi1
+
+ mull $aj,$bi,$alo
+ mov sp,$tp
+ umulh $aj,$bi,$ahi
+
+ mull $nj,$m1,$nlo
+ s8addl $j,$ap,$aj
+ umulh $nj,$m1,$nhi
+.align 4
+.Linner:
+ .set noreorder
+ ldl $tj,8($tp) #L0
+ nop #U1
+ ldl $aj,0($aj) #L1
+ s8addl $j,$np,$nj #U0
+
+ ldl $nj,0($nj) #L0
+ nop #U1
+ addl $alo,$hi0,$lo0 #L1
+ ldi $tp,8($tp)
+
+ mull $aj,$bi,$alo #U1
+ cmpult $lo0,$hi0,AT #L0
+ addl $nlo,$hi1,$lo1 #L1
+ addw $j,1,$j
+
+ mull $nj,$m1,$nlo #U1
+ addl $ahi,AT,$hi0 #L0
+ addl $lo0,$tj,$lo0 #L1
+ cmpult $lo1,$hi1,v0 #U0
+
+ umulh $aj,$bi,$ahi #U1
+ cmpult $lo0,$tj,AT #L0
+ addl $lo1,$lo0,$lo1 #L1
+ addl $nhi,v0,$hi1 #U0
+
+ umulh $nj,$m1,$nhi #U1
+ s8addl $j,$ap,$aj #L0
+ cmpult $lo1,$lo0,v0 #L1
+ cmplt $j,$num,$tj #U0 # borrow $tj
+
+ addl $hi0,AT,$hi0 #L0
+ addl $hi1,v0,$hi1 #U1
+ stl $lo1,-8($tp) #L1
+ bne $tj,.Linner #U0
+ .set reorder
+
+ ldl $tj,8($tp)
+ addl $alo,$hi0,$lo0
+ addl $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addl $ahi,AT,$hi0
+ addl $nhi,v0,$hi1
+
+ addl $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addl $hi0,AT,$hi0
+
+ ldl $tj,16($tp)
+ addl $lo1,$lo0,$j
+ cmpult $j,$lo0,v0
+ addl $hi1,v0,$hi1
+
+ addl $hi1,$hi0,$lo1
+ stl $j,0($tp)
+ cmpult $lo1,$hi0,$hi1
+ addl $lo1,$tj,$lo1
+ cmpult $lo1,$tj,AT
+ addw $i,1,$i
+ addl $hi1,AT,$hi1
+ stl $lo1,8($tp)
+ cmplt $i,$num,$tj # borrow $tj
+ stl $hi1,16($tp)
+ bne $tj,.Louter
+
+ s8addl $num,sp,$tj # &tp[num]
+ mov $rp,$bp # put rp aside
+ mov sp,$tp
+ mov sp,$ap
+ mov 0,$hi0 # clear borrow bit
+
+.align 4
+.Lsub: ldl $lo0,0($tp)
+ ldl $lo1,0($np)
+ ldi $tp,8($tp)
+ ldi $np,8($np)
+ subl $lo0,$lo1,$lo1 # tp[i]-np[i]
+ cmpult $lo0,$lo1,AT
+ subl $lo1,$hi0,$lo0
+ cmpult $lo1,$lo0,$hi0
+ or $hi0,AT,$hi0
+ stl $lo0,0($rp)
+ cmpult $tp,$tj,v0
+ ldi $rp,8($rp)
+ bne v0,.Lsub
+
+ subl $hi1,$hi0,$hi0 # handle upmost overflow bit
+ mov sp,$tp
+ mov $bp,$rp # restore rp
+
+.align 4
+.Lcopy: ldl $aj,0($tp) # conditional copy
+ ldl $nj,0($rp)
+ ldi $tp,8($tp)
+ ldi $rp,8($rp)
+ seleq $hi0,$nj,$aj
+ stl zero,-8($tp) # zap tp
+ cmpult $tp,$tj,AT
+ stl $aj,-8($rp)
+ bne AT,.Lcopy
+ mov 1,v0
+
+.Lexit:
+ .set noreorder
+ mov fp,sp
+ /*ldl ra,0(sp)*/
+ ldl s3,8(sp)
+ ldl s4,16(sp)
+ ldl s5,24(sp)
+ ldl fp,32(sp)
+ ldi sp,48(sp)
+ ret (ra)
+.end bn_mul_mont
+.ascii "Montgomery Multiplication for Sw_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
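
For reference, the routine above computes a word-oriented Montgomery multiplication: rp = ap * bp * R^-1 mod np, with R = 2^(64*num) and n0 pointing at -np[0]^-1 mod 2^64; it returns 0 for num < 4 and 1 otherwise. Below is a minimal C sketch of that semantics, for orientation only — it is not part of the patch, bn_mul_mont_ref is a made-up name, and it assumes a compiler with unsigned __int128 and variable-length arrays.

#include <stdint.h>
#include <string.h>

typedef uint64_t BN_ULONG;

/* rp = ap * bp * 2^(-64*num) mod np;  n0p points at -np[0]^(-1) mod 2^64 */
static int bn_mul_mont_ref(BN_ULONG *rp, const BN_ULONG *ap,
                           const BN_ULONG *bp, const BN_ULONG *np,
                           const BN_ULONG *n0p, int num)
{
    unsigned __int128 t;
    BN_ULONG n0 = n0p[0], carry, m, borrow;
    int i, j;

    if (num < 4)                          /* the assembly refuses short inputs the same way */
        return 0;

    BN_ULONG tp[num + 2];                 /* working accumulator, num+2 words */
    memset(tp, 0, sizeof(tp));

    for (i = 0; i < num; i++) {
        carry = 0;
        for (j = 0; j < num; j++) {       /* tp += ap[] * bp[i] */
            t = (unsigned __int128)ap[j] * bp[i] + tp[j] + carry;
            tp[j] = (BN_ULONG)t;
            carry = (BN_ULONG)(t >> 64);
        }
        t = (unsigned __int128)tp[num] + carry;
        tp[num]     = (BN_ULONG)t;
        tp[num + 1] = (BN_ULONG)(t >> 64);

        m = tp[0] * n0;                   /* Montgomery factor for this word */
        t = (unsigned __int128)m * np[0] + tp[0];
        carry = (BN_ULONG)(t >> 64);      /* low word is 0 by construction */
        for (j = 1; j < num; j++) {       /* tp += m * np[], shifted down one word */
            t = (unsigned __int128)m * np[j] + tp[j] + carry;
            tp[j - 1] = (BN_ULONG)t;
            carry = (BN_ULONG)(t >> 64);
        }
        t = (unsigned __int128)tp[num] + carry;
        tp[num - 1] = (BN_ULONG)t;
        tp[num]     = tp[num + 1] + (BN_ULONG)(t >> 64);
        tp[num + 1] = 0;
    }

    /* conditional final subtraction, as in the .Lsub/.Lcopy loops above */
    borrow = 0;
    for (j = 0; j < num; j++) {
        BN_ULONG d = tp[j] - np[j];
        BN_ULONG b = tp[j] < np[j];
        rp[j]  = d - borrow;
        borrow = b | (d < borrow);
    }
    if (tp[num] < borrow)                 /* tp < np: keep the unreduced value */
        memcpy(rp, tp, num * sizeof(BN_ULONG));
    return 1;
}

The conditional subtraction at the end mirrors the assembly's .Lsub/.Lcopy loops, which subtract np and then copy the unreduced value back over the result when a borrow is left over.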
diff --git a/crypto/bn/bn_local.h b/crypto/bn/bn_local.h
index 50e9d26..e73bd3f 100644
--- a/crypto/bn/bn_local.h
+++ b/crypto/bn/bn_local.h
@@ -387,7 +387,7 @@ struct bn_gencb_st {
# define BN_UMULT_LOHI(low,high,a,b) ({ \
uint128_t ret=(uint128_t)(a)*(b); \
(high)=ret>>64; (low)=ret; })
-# elif defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+# elif (defined(__alpha) || defined(__sw_64)) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
# if defined(__DECC)
# include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
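
As the __DECC branch in the surrounding context shows, this #elif selects a BN_UMULT_HIGH implementation built on the umulh instruction; the patch simply lets sw_64 take the same path as Alpha. Semantically the macro yields the upper 64 bits of a 64x64-bit product, as in this small reference sketch (illustration only, umulh_ref is not an OpenSSL symbol):

#include <stdint.h>

/* reference semantics of BN_UMULT_HIGH / umulh: the high 64 bits of a*b */
static inline uint64_t umulh_ref(uint64_t a, uint64_t b)
{
    return (uint64_t)(((unsigned __int128)a * b) >> 64);
}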
diff --git a/crypto/bn/build.info b/crypto/bn/build.info
index c4ba51b..b598423 100644
--- a/crypto/bn/build.info
+++ b/crypto/bn/build.info
@@ -168,7 +168,7 @@ GENERATE[ppc-mont.s]=asm/ppc-mont.pl
GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl
GENERATE[alpha-mont.S]=asm/alpha-mont.pl
-
+GENERATE[sw_64-mont.S]=asm/sw_64-mont.pl
GENERATE[armv4-mont.S]=asm/armv4-mont.pl
INCLUDE[armv4-mont.o]=..
GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl
diff --git a/crypto/modes/asm/ghash-sw_64.pl b/crypto/modes/asm/ghash-sw_64.pl
new file mode 100644
index 0000000..59b5596
--- /dev/null
+++ b/crypto/modes/asm/ghash-sw_64.pl
@@ -0,0 +1,467 @@
+#! /usr/bin/env perl
+# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% than code generated by vendor
+# compiler.
+
+$cnt="v0"; # $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3"; # $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7"; # $8
+#################
+$Xi="a0"; # $16, input argument block
+$Htbl="a1";
+$inp="a2";
+$len="a3";
+$nlo="a4"; # $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10"; # $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT"; # $28
+
+{ my $N;
+ sub loop() {
+
+ $N++;
+$code.=<<___;
+.align 4
+ extlb $Xlo,7,$nlo
+ and $nlo,0xf0,$nhi
+ sll $nlo,4,$nlo
+ and $nlo,0xf0,$nlo
+
+ addl $nlo,$Htbl,$nlo
+ ldl $Zlo,8($nlo)
+ addl $nhi,$Htbl,$nhi
+ ldl $Zhi,0($nlo)
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ ldi $cnt,6(zero)
+ extlb $Xlo,6,$nlo
+
+ ldl $Tlo1,8($nhi)
+ s8addl $remp,$rem_4bit,$remp
+ ldl $Thi1,0($nhi)
+ srl $Zlo,4,$Zlo
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ and $nlo,0xf0,$nhi
+
+ xor $Tlo1,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ xor $Thi1,$Zhi,$Zhi
+ and $nlo,0xf0,$nlo
+
+ addl $nlo,$Htbl,$nlo
+ ldl $Tlo0,8($nlo)
+ addl $nhi,$Htbl,$nhi
+ ldl $Thi0,0($nlo)
+
+.Looplo$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subl $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldl $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldl $Thi1,0($nhi)
+ s8addl $remp,$rem_4bit,$remp
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extlb $Xlo,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addl $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addl $nlo,$Htbl,$nlo
+ addl $nhi,$Htbl,$nhi
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldl $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldl $Thi0,0($nlo)
+ bne $cnt,.Looplo$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ ldi $cnt,7(zero)
+ srl $Zlo,4,$Zlo
+
+ ldl $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldl $Thi1,0($nhi)
+ s8addl $remp,$rem_4bit,$remp
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extlb $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addl $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addl $nlo,$Htbl,$nlo
+ addl $nhi,$Htbl,$nhi
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldl $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldl $Thi0,0($nlo)
+ unop
+
+
+.Loophi$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subl $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldl $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldl $Thi1,0($nhi)
+ s8addl $remp,$rem_4bit,$remp
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extlb $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addl $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addl $nlo,$Htbl,$nlo
+ addl $nhi,$Htbl,$nhi
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldl $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldl $Thi0,0($nlo)
+ bne $cnt,.Loophi$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ ldl $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldl $Thi1,0($nhi)
+ s8addl $remp,$rem_4bit,$remp
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo0,$Zlo,$Zlo
+ xor $Thi0,$Zhi,$Zhi
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ s8addl $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+
+ ldl $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ xor $t0,$Zlo,$Zlo
+ xor $rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.globl gcm_gmult_4bit
+.align 4
+.ent gcm_gmult_4bit
+gcm_gmult_4bit:
+ .frame sp,0,ra
+ .prologue 0
+
+ ldl $Xlo,8($Xi)
+ ldl $Xhi,0($Xi)
+
+ bsr $t0,picmeup
+ nop
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ stl $Xlo,8($Xi)
+ stl $Xhi,0($Xi)
+
+ ret (ra)
+.end gcm_gmult_4bit
+___
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.align 4
+.ent gcm_ghash_4bit
+gcm_ghash_4bit:
+ ldi sp,-32(sp)
+ stl ra,0(sp)
+ stl s0,8(sp)
+ stl s1,16(sp)
+ .mask 0x04000600,-32
+ .frame sp,32,ra
+ .prologue 0
+
+ ldl_u $inhi,0($inp)
+ ldl_u $Thi0,7($inp)
+ ldl_u $inlo,8($inp)
+ ldl_u $Tlo0,15($inp)
+ ldl $Xhi,0($Xi)
+ ldl $Xlo,8($Xi)
+
+ bsr $t0,picmeup
+ nop
+
+.Louter:
+ extll $inhi,$inp,$inhi
+ exthl $Thi0,$inp,$Thi0
+ or $inhi,$Thi0,$inhi
+ ldi $inp,16($inp)
+
+ extll $inlo,$inp,$inlo
+ exthl $Tlo0,$inp,$Tlo0
+ or $inlo,$Tlo0,$inlo
+ subl $len,16,$len
+
+ xor $Xlo,$inlo,$Xlo
+ xor $Xhi,$inhi,$Xhi
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+ beq $len,.Ldone
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+ ldl_u $inhi,0($inp)
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+ ldl_u $Thi0,7($inp)
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+ ldl_u $inlo,8($inp)
+ ldl_u $Tlo0,15($inp)
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ br zero,.Louter
+
+.Ldone:
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+
+ stl $Xlo,8($Xi)
+ stl $Xhi,0($Xi)
+
+ .set noreorder
+ /*ldl ra,0(sp)*/
+ ldl s0,8(sp)
+ ldl s1,16(sp)
+ ldi sp,32(sp)
+ ret (ra)
+.end gcm_ghash_4bit
+
+.align 4
+.ent picmeup
+picmeup:
+ .frame sp,0,$t0
+ .prologue 0
+ br $rem_4bit,.Lpic
+.Lpic: ldi $rem_4bit,12($rem_4bit)
+ ret ($t0)
+.end picmeup
+ nop
+rem_4bit:
+ .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
+ .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
+ .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
+ .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
+.ascii "GHASH for Sw_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+
+___
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
+
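The header comment describes a "4-bit" table-driven GHASH: a 256-byte per-key Htbl (16 entries of 16 bytes) plus the shared 128-byte rem_4bit table emitted above. The operation being computed is multiplication in GF(2^128) under the bit-reflected GHASH convention; the bit-by-bit C sketch below is a reference for that operation only (ghash_mul_ref and its word layout, with index 0 holding the most-significant half, are assumptions of the sketch, not code from the patch).

#include <stdint.h>

/* Z = X * H in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, bit-reflected
 * GHASH convention; word [0] is the most-significant 64 bits */
static void ghash_mul_ref(uint64_t Z[2], const uint64_t X[2], const uint64_t H[2])
{
    uint64_t Vhi = H[0], Vlo = H[1];
    uint64_t Zhi = 0, Zlo = 0;

    for (int i = 0; i < 128; i++) {
        /* bit i of X, counting from the most significant bit */
        uint64_t xi   = (X[i / 64] >> (63 - (i % 64))) & 1;
        uint64_t mask = 0 - xi;

        Zhi ^= Vhi & mask;                 /* Z ^= V when the bit is set */
        Zlo ^= Vlo & mask;

        /* V = V * x: shift right one bit, reduce by R = 0xE1 << 120 on carry-out */
        uint64_t lsb = Vlo & 1;
        Vlo = (Vlo >> 1) | (Vhi << 63);
        Vhi = (Vhi >> 1) ^ ((0 - lsb) & 0xE100000000000000ULL);
    }
    Z[0] = Zhi;
    Z[1] = Zlo;
}

The assembly consumes Xi a nibble at a time instead, looking up 16-byte partial products in Htbl and folding the four bits shifted out of Z back in through rem_4bit — which is where the per-key and shared table sizes quoted in the header come from.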
diff --git a/crypto/sha/asm/sha1-sw_64.pl b/crypto/sha/asm/sha1-sw_64.pl
new file mode 100644
index 0000000..cce4015
--- /dev/null
+++ b/crypto/sha/asm/sha1-sw_64.pl
@@ -0,0 +1,329 @@
+#! /usr/bin/env perl
+# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for Sw_64.
+
+# On 21264 performance is 33% better than code generated by vendor
+# compiler, and 75% better than GCC [3.4], and in absolute terms is
+# 8.7 cycles per processed byte. Implementation features vectorized
+# byte swap, but not Xupdate.
+
+@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
+ "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
+$ctx="a0"; # $16
+$inp="a1";
+$num="a2";
+$A="a3";
+$B="a4"; # 20
+$C="a5";
+$D="t8";
+$E="t9"; @V=($A,$B,$C,$D,$E);
+$t0="t10"; # 24
+$t1="t11";
+$t2="ra";
+$t3="t12";
+$K="AT"; # 28
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+ ldl_u @X[0],0+0($inp)
+ ldl_u @X[1],0+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<14);
+ ldl_u @X[$i+2],($i+2)*4+0($inp)
+ ldl_u @X[$i+3],($i+2)*4+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<15);
+ extll @X[$i],$inp,@X[$i]
+ exthl @X[$i+1],$inp,@X[$i+1]
+
+ or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
+
+ srl @X[$i],24,$t0 # vectorized byte swap
+ srl @X[$i],8,$t2
+
+ sll @X[$i],8,$t3
+ sll @X[$i],24,@X[$i]
+ zapnot $t0,0x11,$t0
+ zapnot $t2,0x22,$t2
+
+ zapnot @X[$i],0x88,@X[$i]
+ or $t0,$t2,$t0
+ zapnot $t3,0x44,$t3
+ sll $a,5,$t1
+
+ or @X[$i],$t0,@X[$i]
+ addw $K,$e,$e
+ and $b,$c,$t2
+ zapnot $a,0xf,$a
+
+ or @X[$i],$t3,@X[$i]
+ srl $a,27,$t0
+ bic $d,$b,$t3
+ sll $b,30,$b
+
+ extll @X[$i],4,@X[$i+1] # extract upper half
+ or $t2,$t3,$t2
+ addw @X[$i],$e,$e
+
+ addw $t1,$e,$e
+ srl $b,32,$t3
+ zapnot @X[$i],0xf,@X[$i]
+
+ addw $t0,$e,$e
+ addw $t2,$e,$e
+ or $t3,$b,$b
+___
+$code.=<<___ if (($i&1) && $i<15);
+ sll $a,5,$t1
+ addw $K,$e,$e
+ and $b,$c,$t2
+ zapnot $a,0xf,$a
+
+ srl $a,27,$t0
+ addw @X[$i%16],$e,$e
+ bic $d,$b,$t3
+ sll $b,30,$b
+
+ or $t2,$t3,$t2
+ addw $t1,$e,$e
+ srl $b,32,$t3
+ zapnot @X[$i],0xf,@X[$i]
+
+ addw $t0,$e,$e
+ addw $t2,$e,$e
+ or $t3,$b,$b
+___
+$code.=<<___ if ($i>=15); # with forward Xupdate
+ sll $a,5,$t1
+ addw $K,$e,$e
+ and $b,$c,$t2
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ zapnot $a,0xf,$a
+ addw @X[$i%16],$e,$e
+ bic $d,$b,$t3
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ srl $a,27,$t0
+ addw $t1,$e,$e
+ or $t2,$t3,$t2
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$b
+ addw $t0,$e,$e
+ srl @X[$j%16],31,$t1
+
+ addw $t2,$e,$e
+ srl $b,32,$t3
+ addw @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ zapnot @X[$i%16],0xf,@X[$i%16]
+ or $t1,@X[$j%16],@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79); # with forward Xupdate
+ sll $a,5,$t1
+ addw $K,$e,$e
+ zapnot $a,0xf,$a
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$t3
+ addw $t1,$e,$e
+ xor $b,$c,$t2
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ srl $b,2,$b
+ addw @X[$i%16],$e,$e
+ xor $d,$t2,$t2
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ srl @X[$j%16],31,$t1
+ addw $t2,$e,$e
+ srl $a,27,$t0
+ addw @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ addw $t0,$e,$e
+ or $t1,@X[$j%16],@X[$j%16]
+___
+$code.=<<___ if ($i<77);
+ zapnot @X[$i%16],0xf,@X[$i%16]
+___
+$code.=<<___ if ($i==79); # with context fetch
+ sll $a,5,$t1
+ addw $K,$e,$e
+ zapnot $a,0xf,$a
+ ldw @X[0],0($ctx)
+
+ sll $b,30,$t3
+ addw $t1,$e,$e
+ xor $b,$c,$t2
+ ldw @X[1],4($ctx)
+
+ srl $b,2,$b
+ addw @X[$i%16],$e,$e
+ xor $d,$t2,$t2
+ ldw @X[2],8($ctx)
+
+ srl $a,27,$t0
+ addw $t2,$e,$e
+ ldw @X[3],12($ctx)
+
+ or $t3,$b,$b
+ addw $t0,$e,$e
+ ldw @X[4],16($ctx)
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___; # with forward Xupdate
+ sll $a,5,$t1
+ addw $K,$e,$e
+ zapnot $a,0xf,$a
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ srl $a,27,$t0
+ and $b,$c,$t2
+ and $b,$d,$t3
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$b
+ addw $t1,$e,$e
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ srl @X[$j%16],31,$t1
+ addw $t0,$e,$e
+ or $t2,$t3,$t2
+ and $c,$d,$t3
+
+ or $t2,$t3,$t2
+ srl $b,32,$t3
+ addw @X[$i%16],$e,$e
+ addw @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ addw $t2,$e,$e
+ or $t1,@X[$j%16],@X[$j%16]
+ zapnot @X[$i%16],0xf,@X[$i%16]
+___
+}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.globl sha1_block_data_order
+.align 5
+.ent sha1_block_data_order
+sha1_block_data_order:
+ ldi sp,-64(sp)
+ stl ra,0(sp)
+ stl s0,8(sp)
+ stl s1,16(sp)
+ stl s2,24(sp)
+ stl s3,32(sp)
+ stl s4,40(sp)
+ stl s5,48(sp)
+ stl fp,56(sp)
+ .mask 0x0400fe00,-64
+ .frame sp,64,ra
+ .prologue 0
+
+ ldw $A,0($ctx)
+ ldw $B,4($ctx)
+ sll $num,6,$num
+ ldw $C,8($ctx)
+ ldw $D,12($ctx)
+ ldw $E,16($ctx)
+ addl $inp,$num,$num
+
+.Lloop:
+ .set noreorder
+ ldih $K,23170(zero)
+ zapnot $B,0xf,$B
+ ldi $K,31129($K) # K_00_19
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldih $K,28378(zero)
+ ldi $K,-5215($K) # K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldih $K,-28900(zero)
+ ldi $K,-17188($K) # K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldih $K,-13725(zero)
+ ldi $K,-15914($K) # K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ addw @X[0],$A,$A
+ addw @X[1],$B,$B
+ addw @X[2],$C,$C
+ addw @X[3],$D,$D
+ addw @X[4],$E,$E
+ stw $A,0($ctx)
+ stw $B,4($ctx)
+ addl $inp,64,$inp
+ stw $C,8($ctx)
+ stw $D,12($ctx)
+ stw $E,16($ctx)
+ cmpult $inp,$num,$t1
+ bne $t1,.Lloop
+
+ .set noreorder
+ ldl ra,0(sp)
+ ldl s0,8(sp)
+ ldl s1,16(sp)
+ ldl s2,24(sp)
+ ldl s3,32(sp)
+ ldl s4,40(sp)
+ ldl s5,48(sp)
+ ldl fp,56(sp)
+ ldi sp,64(sp)
+ ret (ra)
+.end sha1_block_data_order
+.ascii "SHA1 block transform for Sw_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
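
BODY_00_19, BODY_20_39 and BODY_40_59 above are the three SHA-1 round groups; between groups only the boolean function and the constant loaded into $K change (the ldih/ldi pairs encode 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC and 0xCA62C1D6), while the "forward Xupdate" interleaves the next message-schedule word, w[t] = ROTL1(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16]), into the current round. A minimal C sketch of one round for orientation (sha1_round_ref is an illustrative name, not patch code):

#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* one SHA-1 round applied to state s[5] = {a,b,c,d,e};
 * t is the round number 0..79, w the scheduled message word */
static void sha1_round_ref(uint32_t s[5], uint32_t w, int t)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f, k;

    if (t < 20)      { f = (b & c) | (~b & d);          k = 0x5A827999; } /* K_00_19 */
    else if (t < 40) { f = b ^ c ^ d;                   k = 0x6ED9EBA1; } /* K_20_39 */
    else if (t < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8F1BBCDC; } /* K_40_59 */
    else             { f = b ^ c ^ d;                   k = 0xCA62C1D6; } /* K_60_79 */

    s[4] = d;
    s[3] = c;
    s[2] = ROTL32(b, 30);
    s[1] = a;
    s[0] = ROTL32(a, 5) + f + e + k + w;
}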
diff --git a/crypto/sha/build.info b/crypto/sha/build.info
index 556a658..9c29460 100644
--- a/crypto/sha/build.info
+++ b/crypto/sha/build.info
@@ -103,7 +103,7 @@ GENERATE[sha256-ia64.s]=asm/sha512-ia64.pl
GENERATE[sha512-ia64.s]=asm/sha512-ia64.pl
GENERATE[sha1-alpha.S]=asm/sha1-alpha.pl
-
+GENERATE[sha1-sw_64.S]=asm/sha1-sw_64.pl
GENERATE[sha1-x86_64.s]=asm/sha1-x86_64.pl
GENERATE[sha1-mb-x86_64.s]=asm/sha1-mb-x86_64.pl
GENERATE[sha256-x86_64.s]=asm/sha512-x86_64.pl
diff --git a/crypto/sw_64cpuid.pl b/crypto/sw_64cpuid.pl
new file mode 100644
index 0000000..0f2d44a
--- /dev/null
+++ b/crypto/sw_64cpuid.pl
@@ -0,0 +1,273 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+$output = pop;
+open STDOUT,">$output";
+
+print <<'___';
+.text
+
+.set noat
+
+.globl OPENSSL_cpuid_setup
+.ent OPENSSL_cpuid_setup
+OPENSSL_cpuid_setup:
+ .frame $30,0,$26
+ .prologue 0
+ ret ($26)
+.end OPENSSL_cpuid_setup
+
+.globl OPENSSL_wipe_cpu
+.ent OPENSSL_wipe_cpu
+OPENSSL_wipe_cpu:
+ .frame $30,0,$26
+ .prologue 0
+ clr $1
+ clr $2
+ clr $3
+ clr $4
+ clr $5
+ clr $6
+ clr $7
+ clr $8
+ clr $16
+ clr $17
+ clr $18
+ clr $19
+ clr $20
+ clr $21
+ clr $22
+ clr $23
+ clr $24
+ clr $25
+ clr $27
+ clr $at
+ clr $29
+ fclr $f0
+ fclr $f1
+ fclr $f10
+ fclr $f11
+ fclr $f12
+ fclr $f13
+ fclr $f14
+ fclr $f15
+ fclr $f16
+ fclr $f17
+ fclr $f18
+ fclr $f19
+ fclr $f20
+ fclr $f21
+ fclr $f22
+ fclr $f23
+ fclr $f24
+ fclr $f25
+ fclr $f26
+ fclr $f27
+ fclr $f28
+ fclr $f29
+ fclr $f30
+ mov $sp,$0
+ ret ($26)
+.end OPENSSL_wipe_cpu
+
+.globl OPENSSL_atomic_add
+.ent OPENSSL_atomic_add
+OPENSSL_atomic_add:
+ .frame $30,0,$26
+ .prologue 0
+1: lldw $0,0($16)
+ ldi $1,1
+ wr_f $1
+ addw $0,$17,$1
+ lstw $1,0($16)
+ rd_f $1
+ beq $1,1b
+ addw $0,$17,$0
+ ret ($26)
+.end OPENSSL_atomic_add
+
+.globl OPENSSL_rdtsc
+.ent OPENSSL_rdtsc
+OPENSSL_rdtsc:
+ .frame $30,0,$26
+ .prologue 0
+ rtc $0
+ ret ($26)
+.end OPENSSL_rdtsc
+
+.globl OPENSSL_cleanse
+.ent OPENSSL_cleanse
+OPENSSL_cleanse:
+ .frame $30,0,$26
+ .prologue 0
+ beq $17,.Ldone
+ and $16,7,$0
+ bic $17,7,$at
+ beq $at,.Little
+ beq $0,.Laligned
+
+.Little:
+ subl $0,8,$0
+ ldl_u $1,0($16)
+ mov $16,$2
+.Lalign:
+ masklb $1,$16,$1
+ ldi $16,1($16)
+ subl $17,1,$17
+ addl $0,1,$0
+ beq $17,.Lout
+ bne $0,.Lalign
+.Lout: stl_u $1,0($2)
+ beq $17,.Ldone
+ bic $17,7,$at
+ beq $at,.Little
+
+.Laligned:
+ stl $31,0($16)
+ subl $17,8,$17
+ ldi $16,8($16)
+ bic $17,7,$at
+ bne $at,.Laligned
+ bne $17,.Little
+.Ldone: ret ($26)
+.end OPENSSL_cleanse
+
+.globl CRYPTO_memcmp
+.ent CRYPTO_memcmp
+CRYPTO_memcmp:
+ .frame $30,0,$26
+ .prologue 0
+ xor $0,$0,$0
+ beq $18,.Lno_data
+
+ xor $1,$1,$1
+ nop
+.Loop_cmp:
+ ldl_u $2,0($16)
+ subl $18,1,$18
+ ldl_u $3,0($17)
+ extlb $2,$16,$2
+ ldi $16,1($16)
+ extlb $3,$17,$3
+ ldi $17,1($17)
+ xor $3,$2,$2
+ or $2,$0,$0
+ bne $18,.Loop_cmp
+
+ subl $31,$0,$0
+ srl $0,63,$0
+.Lno_data:
+ ret ($26)
+.end CRYPTO_memcmp
+___
+{
+my ($out,$cnt,$max)=("\$16","\$17","\$18");
+my ($tick,$lasttick)=("\$19","\$20");
+my ($diff,$lastdiff)=("\$21","\$22");
+my ($lock1,$lock2)=("\$23","\$24");
+my ($v0,$ra,$sp,$zero)=("\$0","\$26","\$30","\$31");
+
+print <<___;
+.globl OPENSSL_instrument_bus
+.ent OPENSSL_instrument_bus
+OPENSSL_instrument_bus:
+ .frame $sp,0,$ra
+ .prologue 0
+ mov $cnt,$v0
+
+ rtc $lasttick
+ mov 0,$diff
+
+ #ecb ($out)
+ lldw $tick,0($out)
+ ldi $lock1,1
+ wr_f $lock1
+ addw $diff,$tick,$tick
+ mov $tick,$diff
+ lstw $tick,0($out)
+ rd_f $tick
+ stw $diff,0($out)
+
+.Loop: rtc $tick
+ subl $tick,$lasttick,$diff
+ mov $tick,$lasttick
+
+ #ecb ($out)
+ lldw $tick,0($out)
+ ldi $lock1,1
+ wr_f $lock1
+ addw $diff,$tick,$tick
+ mov $tick,$diff
+ lstw $tick,0($out)
+ rd_f $tick
+ stw $diff,0($out)
+
+ subw $cnt,1,$cnt
+ ldi $out,4($out)
+ bne $cnt,.Loop
+
+ ret ($ra)
+.end OPENSSL_instrument_bus
+
+.globl OPENSSL_instrument_bus2
+.ent OPENSSL_instrument_bus2
+OPENSSL_instrument_bus2:
+ .frame $sp,0,$ra
+ .prologue 0
+ mov $cnt,$v0
+
+ rtc $lasttick
+ mov 0,$diff
+
+ #ecb ($out)
+ lldw $tick,0($out)
+ ldi $lock1,1
+ wr_f $lock1
+ addw $diff,$tick,$tick
+ mov $tick,$diff
+ lstw $tick,0($out)
+ rd_f $tick
+ stw $diff,0($out)
+
+ rtc $tick
+ subl $tick,$lasttick,$diff
+ mov $tick,$lasttick
+ mov $diff,$lastdiff
+.Loop2:
+ #ecb ($out)
+ lldw $tick,0($out)
+ ldi $lock1,1
+ wr_f $lock1
+ addw $diff,$tick,$tick
+ mov $tick,$diff
+ lstw $tick,0($out)
+ rd_f $tick
+ stw $diff,0($out)
+
+ subw $max,1,$max
+ beq $max,.Ldone2
+
+ rtc $tick
+ subl $tick,$lasttick,$diff
+ mov $tick,$lasttick
+ subl $lastdiff,$diff,$tick
+ mov $diff,$lastdiff
+ selne $tick,1,$tick
+ subw $cnt,$tick,$cnt
+ s4addl $tick,$out,$out
+ bne $cnt,.Loop2
+
+.Ldone2:
+ subw $v0,$cnt,$v0
+ ret ($ra)
+.end OPENSSL_instrument_bus2
+___
+}
+
+close STDOUT;
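
Two of the routines above read most easily against their C semantics: OPENSSL_atomic_add is a load-locked/store-conditional retry loop (lldw/wr_f/lstw/rd_f), and CRYPTO_memcmp ORs together the byte-wise XOR differences so its running time does not depend on where the buffers differ. A rough C sketch of those semantics follows (illustration only; the names are made up, and the gcc __atomic built-in and its memory order are assumptions, not what the assembly literally does):

#include <stddef.h>
#include <stdint.h>

/* what the lldw/lstw retry loop achieves: atomically add b to *p,
 * returning the new value (gcc built-in, sequentially consistent) */
static int atomic_add_ref(volatile int *p, int b)
{
    return __atomic_add_fetch(p, b, __ATOMIC_SEQ_CST);
}

/* constant-time comparison: accumulate differences, never branch on data */
static int crypto_memcmp_ref(const void *in_a, const void *in_b, size_t len)
{
    const unsigned char *a = in_a, *b = in_b;
    unsigned char acc = 0;

    for (size_t i = 0; i < len; i++)
        acc |= a[i] ^ b[i];

    return acc != 0;        /* 0 if equal, 1 otherwise, like the asm's final negate/srl */
}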
diff --git a/include/crypto/md32_common.h b/include/crypto/md32_common.h
index 3b16f1b..84dc45a 100644
--- a/include/crypto/md32_common.h
+++ b/include/crypto/md32_common.h
@@ -226,7 +226,7 @@ int HASH_FINAL(unsigned char *md, HASH_CTX *c)
}
#ifndef MD32_REG_T
-# if defined(__alpha) || defined(__sparcv9) || defined(__mips)
+# if defined(__alpha) || defined(__sw_64) || defined(__sparcv9) || defined(__mips)
# define MD32_REG_T long
/*
* This comment was originally written for MD5, which is why it
--
2.27.0