!163 SM3/SM4 optimization for ARM by HW instruction
From: @xu-yi-zhou Reviewed-by: @zhujianwei001 Signed-off-by: @zhujianwei001
This commit is contained in: commit 4c696367a7

Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch (new file)
@@ -0,0 +1,492 @@
From 4d2e328357ac4b468d4762a5a5f615d7e7bf46a6 Mon Sep 17 00:00:00 2001
From: Xu Yizhou <xuyizhou1@huawei.com>
Date: Thu, 27 Oct 2022 20:49:34 +0800
Subject: [PATCH 1/3] SM3 acceleration with SM3 hardware instruction on aarch64

This patch contains the following two PRs:

1. SM3 acceleration with SM3 hardware instruction on aarch64

The SM3 hardware instruction is an optional feature of the crypto
extension for aarch64. This implementation accelerates SM3 via the SM3
instructions. On platforms that do not support the SM3 instructions, the
original C implementation still works. Thanks to AliBaba for testing and
reporting the following perf numbers for the Yitian 710:

Benchmark on T-Head Yitian-710 2.75GHz:

Before:
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k

After (33% - 74% faster):
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17454)

2. Fix sm3ss1 translation issue in sm3-armv8.pl

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17542)

Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
---
 Configurations/00-base-templates.conf |   1 +
 Configure                             |   4 +
 crypto/arm64cpuid.pl                  |   7 +
 crypto/arm_arch.h                     |   1 +
 crypto/armcap.c                       |  10 +
 crypto/sm3/asm/sm3-armv8.pl           | 280 ++++++++++++++++++++++++++
 crypto/sm3/build.info                 |  15 +-
 crypto/sm3/sm3_local.h                |  16 +-
 8 files changed, 332 insertions(+), 2 deletions(-)
 create mode 100644 crypto/sm3/asm/sm3-armv8.pl

diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf
index 1d35012..a67ae65 100644
--- a/Configurations/00-base-templates.conf
+++ b/Configurations/00-base-templates.conf
@@ -322,6 +322,7 @@ my %targets=(
poly1305_asm_src=> "poly1305-armv8.S",
keccak1600_asm_src => "keccak1600-armv8.S",
sm4_asm_src => "vpsm4_ex-armv8.S",
+ sm3_asm_src => "sm3-armv8.S",
},
parisc11_asm => {
template => 1,
diff --git a/Configure b/Configure
index 3bfe360..fce460d 100755
--- a/Configure
+++ b/Configure
@@ -1423,6 +1423,9 @@ unless ($disabled{asm}) {
if ($target{sm4_asm_src} ne "") {
push @{$config{lib_defines}}, "VPSM4_EX_ASM";
}
+ if ($target{sm3_asm_src} ne "") {
+ push @{$config{lib_defines}}, "SM3_ASM";
+ }
}

my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC});
@@ -3379,6 +3382,7 @@ sub print_table_entry
"multilib",
"build_scheme",
"sm4_asm_src",
+ "sm3_asm_src",
);

if ($type eq "TABLE") {
diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index 319927e..1e9b167 100755
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@@ -78,6 +78,13 @@ _armv8_sha512_probe:
ret
.size _armv8_sha512_probe,.-_armv8_sha512_probe

+.globl _armv8_sm3_probe
+.type _armv8_sm3_probe,%function
+_armv8_sm3_probe:
+ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
+ ret
+.size _armv8_sm3_probe,.-_armv8_sm3_probe
+
.globl OPENSSL_cleanse
.type OPENSSL_cleanse,%function
.align 5
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 8b71055..8839b21 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -80,5 +80,6 @@ extern unsigned int OPENSSL_armcap_P;
# define ARMV8_SHA256 (1<<4)
# define ARMV8_PMULL (1<<5)
# define ARMV8_SHA512 (1<<6)
+# define ARMV8_SM3 (1<<9)

#endif
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 48c5d4d..8b2f4a5 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -47,6 +47,7 @@ void _armv8_sha1_probe(void);
void _armv8_sha256_probe(void);
void _armv8_pmull_probe(void);
# ifdef __aarch64__
+void _armv8_sm3_probe(void);
void _armv8_sha512_probe(void);
# endif
uint32_t _armv7_tick(void);
@@ -130,6 +131,7 @@ static unsigned long getauxval(unsigned long key)
# define HWCAP_CE_PMULL (1 << 4)
# define HWCAP_CE_SHA1 (1 << 5)
# define HWCAP_CE_SHA256 (1 << 6)
+# define HWCAP_CE_SM3 (1 << 18)
# define HWCAP_CE_SHA512 (1 << 21)
# endif

@@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void)
# ifdef __aarch64__
if (hwcap & HWCAP_CE_SHA512)
OPENSSL_armcap_P |= ARMV8_SHA512;
+
+ if (hwcap & HWCAP_CE_SM3)
+ OPENSSL_armcap_P |= ARMV8_SM3;
# endif
}
# endif
@@ -233,6 +238,11 @@ void OPENSSL_cpuid_setup(void)
_armv8_sha512_probe();
OPENSSL_armcap_P |= ARMV8_SHA512;
}
+
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sm3_probe();
+ OPENSSL_armcap_P |= ARMV8_SM3;
+ }
# endif
}
# endif
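The armcap.c change keeps two detection paths: on Linux the HWCAP_CE_SM3 bit from getauxval(AT_HWCAP) is preferred, and otherwise the instruction itself is probed under a SIGILL handler — if the CPU lacks the extension, the probe faults and the capability flag stays clear. A minimal sketch of that probe pattern (simplified from the surrounding OPENSSL_cpuid_setup logic, not the patch's code):

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    static sigjmp_buf ill_jmp;
    static void ill_handler(int sig) { siglongjmp(ill_jmp, sig); }

    extern void _armv8_sm3_probe(void);

    /* Returns 1 if the sm3partw1 probe executes without raising SIGILL. */
    static int have_sm3(void)
    {
        int ok = 0;
        struct sigaction act, oact;

        memset(&act, 0, sizeof(act));
        act.sa_handler = ill_handler;
        sigaction(SIGILL, &act, &oact);
        if (sigsetjmp(ill_jmp, 1) == 0) {
            _armv8_sm3_probe();   /* faults here if SM3 is absent */
            ok = 1;
        }
        sigaction(SIGILL, &oact, NULL);
        return ok;
    }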
diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
new file mode 100644
index 0000000..677ca52
--- /dev/null
+++ b/crypto/sm3/asm/sm3-armv8.pl
@@ -0,0 +1,280 @@
+#! /usr/bin/env perl
+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# This module implements support for Armv8 SM3 instructions
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Message expanding:
+# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
+# Input: s0, s1, s2, s3
+# s0 = w0 | w1 | w2 | w3
+# s1 = w4 | w5 | w6 | w7
+# s2 = w8 | w9 | w10 | w11
+# s3 = w12 | w13 | w14 | w15
+# Output: s4
+sub msg_exp () {
+my $s0 = shift;
+my $s1 = shift;
+my $s2 = shift;
+my $s3 = shift;
+my $s4 = shift;
+my $vtmp1 = shift;
+my $vtmp2 = shift;
+$code.=<<___;
+ // s4 = w7 | w8 | w9 | w10
+ ext $s4.16b, $s1.16b, $s2.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext $vtmp1.16b, $s0.16b, $s1.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext $vtmp2.16b, $s2.16b, $s3.16b, #8
+ sm3partw1 $s4.4s, $s0.4s, $s3.4s
+ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s
+___
+}
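For reference, the expansion rule quoted in the comment above, written out scalar-wise in C (a sketch following the SM3 specification; the NEON code above computes four W words per msg_exp call instead):

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

    /* P1 permutation from the SM3 specification */
    static uint32_t P1(uint32_t x) { return x ^ rotl32(x, 15) ^ rotl32(x, 23); }

    /* Expand W[16..67] from the 16 message words W[0..15]. */
    static void sm3_expand(uint32_t W[68])
    {
        for (int j = 16; j < 68; j++)
            W[j] = P1(W[j - 16] ^ W[j - 9] ^ rotl32(W[j - 3], 15))
                   ^ rotl32(W[j - 13], 7) ^ W[j - 6];
    }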
+
+# A round of the compression function
+# Input:
+# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
+# vstate0 - vstate1, store digest status(A - H)
+# vconst0 - vconst1, interleaved used to store Tj <<< j
+# vtmp - temporary register
+# vw - for sm3tt1ab, vw = s0 eor s1
+# s0 - for sm3tt2ab, just be s0
+# i, choose wj' or wj from vw
+sub round () {
+my $ab = shift;
+my $vstate0 = shift;
+my $vstate1 = shift;
+my $vconst0 = shift;
+my $vconst1 = shift;
+my $vtmp = shift;
+my $vw = shift;
+my $s0 = shift;
+my $i = shift;
+$code.=<<___;
+ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
+ shl $vconst1.4s, $vconst0.4s, #1
+ sri $vconst1.4s, $vconst0.4s, #31
+ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i]
+ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i]
+___
+}
+
+sub qround () {
+my $ab = shift;
+my $vstate0 = shift;
+my $vstate1 = shift;
+my $vconst0 = shift;
+my $vconst1 = shift;
+my $vtmp1 = shift;
+my $vtmp2 = shift;
+my $s0 = shift;
+my $s1 = shift;
+my $s2 = shift;
+my $s3 = shift;
+my $s4 = shift;
+ if($s4) {
+ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
+ }
+$code.=<<___;
+ eor $vtmp1.16b, $s0.16b, $s1.16b
+___
+ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
+ $vtmp1, $s0, 0);
+ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
+ $vtmp1, $s0, 1);
+ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
+ $vtmp1, $s0, 2);
+ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
+ $vtmp1, $s0, 3);
+}
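Note the shl/sri pair inside round(): shifting left by one and then inserting the shifted-out top bit back at bit 31 is a rotate-left-by-1, so the destination register always ends up holding Tj <<< (j+1) for the next round. The four round() calls in qround() alternate $vconst0 and $vconst1 as source and destination, which is why the whole schedule of rotated constants can be derived from the two seed words loaded from .Tj further below, with no per-round table lookups.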
+
+$code=<<___;
+#include "arm_arch.h"
+.arch armv8.2-a
+.text
+___
+
+{{{
+my ($pstate,$pdata,$num)=("x0","x1","w2");
+my ($state1,$state2)=("v5","v6");
+my ($sconst1, $sconst2)=("s16","s17");
+my ($vconst1, $vconst2)=("v16","v17");
+my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
+my ($bkstate1,$bkstate2)=("v18","v19");
+my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
+my ($vtmp1,$vtmp2)=("v22","v23");
+my $constaddr="x8";
+# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
+$code.=<<___;
+.globl ossl_hwsm3_block_data_order
+.type ossl_hwsm3_block_data_order,%function
+.align 5
+ossl_hwsm3_block_data_order:
+ // load state
+ ld1 {$state1.4s-$state2.4s}, [$pstate]
+ rev64 $state1.4s, $state1.4s
+ rev64 $state2.4s, $state2.4s
+ ext $state1.16b, $state1.16b, $state1.16b, #8
+ ext $state2.16b, $state2.16b, $state2.16b, #8
+
+ adr $constaddr, .Tj
+ ldp $sconst1, $sconst2, [$constaddr]
+
+.Loop:
+ // load input
+ ld1 {$s0.16b-$s3.16b}, [$pdata], #64
+ sub $num, $num, #1
+
+ mov $bkstate1.16b, $state1.16b
+ mov $bkstate2.16b, $state2.16b
+
+#ifndef __ARMEB__
+ rev32 $s0.16b, $s0.16b
+ rev32 $s1.16b, $s1.16b
+ rev32 $s2.16b, $s2.16b
+ rev32 $s3.16b, $s3.16b
+#endif
+
+ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
+___
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4,$s0,$s1,$s2);
+
+$code.=<<___;
+ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
+___
+
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0,$s1,$s2,$s3);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4,$s0,$s1,$s2);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0,$s1,$s2,$s3);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1);
+
+$code.=<<___;
+ eor $state1.16b, $state1.16b, $bkstate1.16b
+ eor $state2.16b, $state2.16b, $bkstate2.16b
+
+ // any remaining blocks?
+ cbnz $num, .Loop
+
+ // save state
+ rev64 $state1.4s, $state1.4s
+ rev64 $state2.4s, $state2.4s
+ ext $state1.16b, $state1.16b, $state1.16b, #8
+ ext $state2.16b, $state2.16b, $state2.16b, #8
+ st1 {$state1.4s-$state2.4s}, [$pstate]
+ ret
+.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
+
+.align 3
+.Tj:
+.word 0x79cc4519, 0x9d8a7a87
+___
+}}}
+
+#########################################
+my %sm3partopcode = (
+ "sm3partw1" => 0xce60C000,
+ "sm3partw2" => 0xce60C400);
+
+my %sm3ss1opcode = (
+ "sm3ss1" => 0xce400000);
+
+my %sm3ttopcode = (
+ "sm3tt1a" => 0xce408000,
+ "sm3tt1b" => 0xce408400,
+ "sm3tt2a" => 0xce408800,
+ "sm3tt2b" => 0xce408C00);
+
+sub unsm3part {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+}
+
+sub unsm3ss1 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+ $mnemonic,$arg;
+}
+
+sub unsm3tt {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
+ $mnemonic,$arg;
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
+ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
+ print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
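These translation subs let the file assemble even when the toolchain does not recognize the SM3 mnemonics: each instruction is re-emitted as a raw .inst word with the register numbers OR-ed into a base opcode (Rd at bit 0, Rn at bit 5, Rm at bit 16, and the lane index at bit 12 for the sm3tt group). As a quick sanity check, the same arithmetic reproduces the word hard-coded in _armv8_sm3_probe earlier in this patch (a standalone sketch, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* sm3partw1 v4.4s, v0.4s, v3.4s: base | Rd | (Rn << 5) | (Rm << 16) */
        uint32_t word = 0xce60C000u | 4u | (0u << 5) | (3u << 16);
        assert(word == 0xce63c004u);  /* matches the .long in arm64cpuid.pl */
        return 0;
    }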
diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
index 6009b19..e113729 100644
--- a/crypto/sm3/build.info
+++ b/crypto/sm3/build.info
@@ -1,2 +1,15 @@
LIBS=../../libcrypto
-SOURCE[../../libcrypto]=sm3.c m_sm3.c
+SOURCE[../../libcrypto]=\
+ sm3.c m_sm3.c {- $target{sm3_asm_src} -}
+
+GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[sm3-armv8.o]=..
+
+BEGINRAW[Makefile]
+##### SM3 assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/sm3-%.S: {- $sourcedir -}/asm/sm3-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+
+ENDRAW[Makefile]
\ No newline at end of file
diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
index 7171de5..aafff63 100644
--- a/crypto/sm3/sm3_local.h
+++ b/crypto/sm3/sm3_local.h
@@ -32,7 +32,21 @@
ll=(c)->G; (void)HOST_l2c(ll, (s)); \
ll=(c)->H; (void)HOST_l2c(ll, (s)); \
} while (0)
-#define HASH_BLOCK_DATA_ORDER sm3_block_data_order
+
+#if defined(SM3_ASM)
+# if defined(__aarch64__)
+# include "crypto/arm_arch.h"
+# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
+void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+# endif
+#endif
+
+#if defined(HWSM3_CAPABLE)
+# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
+ : sm3_block_data_order)
+#else
+# define HASH_BLOCK_DATA_ORDER sm3_block_data_order
+#endif

void sm3_transform(SM3_CTX *c, const unsigned char *data);

--
2.36.1

Backport-SM4-optimization-for-ARM-by-HW-instruction.patch (new file, 1032 lines)
File diff suppressed because it is too large.

Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch (new file, 621 lines)
@@ -0,0 +1,621 @@
From 3f0898b2aea424f18f58a182803478f25548674e Mon Sep 17 00:00:00 2001
From: Xu Yizhou <xuyizhou1@huawei.com>
Date: Wed, 2 Nov 2022 11:13:07 +0800
Subject: [PATCH 3/3] SM4 XTS optimization for ARM by HW instruction

This patch implements the SM4 XTS optimization for ARM processors,
using the SM4 HW instructions, an optional feature of the crypto
extension for aarch64 (ARMv8).

Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
---
 crypto/evp/e_sm4.c            |  28 ++
 crypto/sm4/asm/sm4-armv8.pl   | 498 +++++++++++++++++++++++++++++++++-
 include/crypto/sm4_platform.h |  14 +
 3 files changed, 537 insertions(+), 3 deletions(-)

diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index eaa5ba0..da4dbd3 100644
--- a/crypto/evp/e_sm4.c
+++ b/crypto/evp/e_sm4.c
@@ -281,6 +281,34 @@ static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2;
xctx->stream_gb = NULL;
xctx->stream = NULL;
+#ifdef HWSM4_CAPABLE
+ if (HWSM4_CAPABLE) {
+ if (enc) {
+ HWSM4_set_encrypt_key(key, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWSM4_encrypt;
+# ifdef HWSM4_xts_encrypt_gb
+ xctx->stream_gb = HWSM4_xts_encrypt_gb;
+# endif
+# ifdef HWSM4_xts_encrypt
+ xctx->stream = HWSM4_xts_encrypt;
+# endif
+ } else {
+ HWSM4_set_decrypt_key(key, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWSM4_decrypt;
+# ifdef HWSM4_xts_decrypt_gb
+ xctx->stream_gb = HWSM4_xts_decrypt_gb;
+# endif
+# ifdef HWSM4_xts_decrypt
+ xctx->stream = HWSM4_xts_decrypt;
+# endif
+ }
+ HWSM4_set_encrypt_key(key + bytes, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f) HWSM4_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ } else
+#endif
#ifdef VPSM4_EX_CAPABLE
if (VPSM4_EX_CAPABLE) {
if (enc) {
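Two details worth noting in this hunk. First, ks2 always receives an encrypt schedule (HWSM4_set_encrypt_key on the second half of the key), because XTS always encrypts the tweak regardless of direction, while ks1 is scheduled for encrypt or decrypt to match the data path. Second, block1/block2 feed OpenSSL's generic block-at-a-time XTS fallback, whereas stream and stream_gb, when set, point at the fused assembly routines that process a whole data unit in one call, for the IEEE and GB tweak conventions respectively.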
diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
index dbacad2..923c1c0 100644
--- a/crypto/sm4/asm/sm4-armv8.pl
+++ b/crypto/sm4/asm/sm4-armv8.pl
@@ -11,9 +11,9 @@
# Oct 2021
#

-# $output is the last argument if it looks like a file (it has an extension)
+# $outut is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
-$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$outut = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
@@ -21,7 +21,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

-open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+open OUT,"| \"$^X\" $xlate $flavour \"$outut\""
 or die "can't call $xlate: $!";
*STDOUT=*OUT;

@@ -110,6 +110,120 @@ $code.=<<___;
___
}

+sub mov_reg_to_vec() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $desv = shift;
+$code.=<<___;
+ mov $desv.d[0],$src0
+ mov $desv.d[1],$src1
+#ifdef __ARMEB__
+ rev32 $desv.16b,$desv.16b
+#endif
+___
+}
+
+sub mov_vec_to_reg() {
+ my $srcv = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+$code.=<<___;
+ mov $des0,$srcv.d[0]
+ mov $des1,$srcv.d[1]
+___
+}
+
+sub compute_tweak() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+ my $tmp0 = shift;
+ my $tmp1 = shift;
+ my $magic = shift;
+$code.=<<___;
+ extr x$tmp1,$src1,$src1,#32
+ extr $des1,$src1,$src0,#63
+ and w$tmp0,w$magic,w$tmp1,asr#31
+ eor $des0,x$tmp0,$src0,lsl#1
+___
+}
+
+sub compute_tweak_vec() {
+ my $src = shift;
+ my $des = shift;
+ my $tmp0 = shift;
+ my $tmp1 = shift;
+ my $magic = shift;
+ &rbit($tmp1,$src);
+$code.=<<___;
+ shl $des.16b, $tmp1.16b, #1
+ ext $tmp0.16b, $tmp1.16b, $tmp1.16b,#15
+ ushr $tmp0.16b, $tmp0.16b, #7
+ mul $tmp0.16b, $tmp0.16b, $magic.16b
+ eor $des.16b, $des.16b, $tmp0.16b
+___
+ &rbit($des,$des);
+}
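compute_tweak is the scalar form of the XTS tweak update: multiply the 128-bit tweak by alpha in GF(2^128), i.e. shift left one bit and, when a bit carries out of the top, fold the reduction constant 0x87 into the low byte (the same constant loaded into w$magic and $qMagic further below). A byte-wise C sketch of the IEEE 1619 convention, for illustration only:

    #include <stdint.h>

    /* Multiply a little-endian 128-bit XTS tweak by alpha (x) in GF(2^128). */
    static void xts_mul_alpha(uint8_t t[16])
    {
        uint8_t carry = 0;

        for (int i = 0; i < 16; i++) {
            uint8_t next = t[i] >> 7;            /* bit shifted out of this byte */
            t[i] = (uint8_t)(t[i] << 1) | carry;
            carry = next;
        }
        if (carry)
            t[0] ^= 0x87;   /* reduction by x^128 + x^7 + x^2 + x + 1 */
    }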
+
+sub mov_en_to_enc(){
+ my $en = shift;
+ my $enc = shift;
+ if ($en eq "en") {
+$code.=<<___;
+ mov $enc,1
+___
+ } else {
+$code.=<<___;
+ mov $enc,0
+___
+ }
+}
+
+sub rbit() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+ if ($standard eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ } else {
+$code.=<<___;
+ mov $dst.16b,$src.16b
+___
+ }
+ } else {
+ if ($standard eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ }
+ }
+}
+
+sub rev32_armeb() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+$code.=<<___;
+#ifdef __ARMEB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+ } else {
+$code.=<<___;
+#ifdef __ARMEB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+ }
+}
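The $standard switch makes the generator emit every XTS routine twice: once with the _gb suffix for GB/T 17964-2021 and once for IEEE Std 1619-2007 (see the prototypes added to sm4_platform.h below). The difference lives entirely in the tweak arithmetic's bit order: in the GB variant, rbit reverses the bits of each byte before and after the multiply-by-alpha step, so the same shift-and-0x87 sequence implements the GB convention, while in the IEEE variant rbit degenerates to a plain mov (or to nothing when source and destination coincide).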
+
$code=<<___;
#include "arm_arch.h"
.arch armv8-a+crypto
@@ -595,6 +709,384 @@ $code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
+
+
+{{{
+my ($inp,$out,$len,$rk1,$rk2,$ivp)=map("x$_",(0..5));
+my ($blocks)=("x2");
+my ($enc)=("x6");
+my ($remain)=("x7");
+my @twx=map("x$_",(9..24));
+my $lastBlk=("x25");
+
+my @tweak=map("v$_",(8..15));
+my @dat=map("v$_",(16..23));
+my $lastTweak=("v24");
+
+# x/w/v/q registers for compute tweak
+my ($magic)=("8");
+my ($tmp0,$tmp1)=("26","27");
+my ($qMagic,$vMagic)=("q25","v25");
+my ($vTmp0,$vTmp1)=("v26","v27");
+
+sub gen_xts_do_cipher() {
+$code.=<<___;
+.globl ${prefix}_xts_do_cipher${standard}
+.type ${prefix}_xts_do_cipher${standard},%function
+.align 5
+${prefix}_xts_do_cipher${standard}:
+ mov w$magic,0x87
+ ldr $qMagic, =0x01010101010101010101010101010187
+ // used to encrypt the initial vector to yield the initial tweak
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk2],#64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk2]
+ ld1 {@tweak[0].4s}, [$ivp]
+___
+ &rev32(@tweak[0],@tweak[0]);
+ &enc_blk(@tweak[0]);
+ &rev32(@tweak[0],@tweak[0]);
+$code.=<<___;
+ // used to encrypt the XORed plaintext blocks
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk1],#64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk1]
+
+ and $remain,$len,#0x0F
+ // convert length into blocks
+ lsr $blocks,$len,4
+ cmp $blocks,#1 // $len must be at least 16
+ b.lt 99f
+
+ cmp $remain,0 // if $len is a multiple of 16
+ b.eq .xts_encrypt_blocks${standard}
+ // if $len is not a multiple of 16
+ subs $blocks,$blocks,#1
+ b.eq .only_2blks_tweak${standard} // if $len is less than 32
+
+.xts_encrypt_blocks${standard}:
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic);
+$code.=<<___;
+1:
+ cmp $blocks,#8
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
+ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic);
+$code.=<<___;
+ b.lt 2f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+ &rbit(@tweak[3],@tweak[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
+___
+ &rbit(@tweak[4],@tweak[4]);
+ &rbit(@tweak[5],@tweak[5]);
+ &rbit(@tweak[6],@tweak[6]);
+ &rbit(@tweak[7],@tweak[7]);
+$code.=<<___;
+ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b
+ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b
+ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b
+ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b
+ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b
+ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b
+ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b
+
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[7].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+ subs $blocks,$blocks,#8
+ b.eq 100f
+ b 1b
+2:
+ // process 4 blocks
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+ &rbit(@tweak[3],@tweak[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ sub $blocks,$blocks,#4
+ mov @tweak[0].16b,@tweak[4].16b
+ mov @tweak[1].16b,@tweak[5].16b
+ mov @tweak[2].16b,@tweak[6].16b
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[3].16b
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@dat[0].4s},[$inp],#16
+___
+ &rbit(@tweak[0],@tweak[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ st1 {@dat[0].4s},[$out],#16
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[0].16b
+ b 100f
+1: // process last 2 blocks
+ cmp $blocks,#2
+ b.gt 1f
+ ld1 {@dat[0].4s,@dat[1].4s},[$inp],#32
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ st1 {@dat[0].4s,@dat[1].4s},[$out],#32
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[1].16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$inp],#48
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$out],#48
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[2].16b
+100:
+ cmp $remain,0
+ b.eq 99f
+
+// This branch calculates the last two tweaks,
+// when the encryption/decryption length is larger than 32
+.last_2blks_tweak${standard}:
+___
+ &rev32_armeb($lastTweak,$lastTweak);
+ &compute_tweak_vec($lastTweak,@tweak[1],$vTmp0,$vTmp1,$vMagic);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// This branch calculates the last two tweaks,
+// when the encryption/decryption length is less than 32 and only two tweaks are needed
+.only_2blks_tweak${standard}:
+ mov @tweak[1].16b,@tweak[0].16b
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec${standard}:
+ // encryption:1 decryption:0
+ cmp $enc,1
+ b.eq .prcess_last_2blks${standard}
+ mov $vTmp0.16B,@tweak[1].16b
+ mov @tweak[1].16B,@tweak[2].16b
+ mov @tweak[2].16B,$vTmp0.16b
+
+.prcess_last_2blks${standard}:
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &rev32_armeb(@tweak[2],@tweak[2]);
+$code.=<<___;
+ ld1 {@dat[0].4s},[$inp],#16
+ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
+ st1 {@dat[0].4s},[$out],#16
+
+ sub $lastBlk,$out,16
+ .loop${standard}:
+ subs $remain,$remain,1
+ ldrb w$tmp0,[$lastBlk,$remain]
+ ldrb w$tmp1,[$inp,$remain]
+ strb w$tmp1,[$lastBlk,$remain]
+ strb w$tmp0,[$out,$remain]
+ b.gt .loop${standard}
+ ld1 {@dat[0].4s}, [$lastBlk]
+ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
+ st1 {@dat[0].4s}, [$lastBlk]
+99:
+ ret
+.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard}
+___
+} #end of gen_xts_do_cipher
+
+}}}
+
+{{{
+my ($enc)=("w6");
+
+sub gen_xts_cipher() {
+ my $en = shift;
+$code.=<<___;
+.globl ${prefix}_xts_${en}crypt${standard}
+.type ${prefix}_xts_${en}crypt${standard},%function
+.align 5
+${prefix}_xts_${en}crypt${standard}:
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+___
+ &mov_en_to_enc($en,$enc);
+$code.=<<___;
+ bl ${prefix}_xts_do_cipher${standard}
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ ret
+.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard}
+___
+
+} # end of gen_xts_cipher
+$standard="_gb";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+$standard="";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+}}}
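A note on the wrappers above: the shared do_cipher body uses x9-x27 for the tweak pipeline (@twx, $lastBlk, the scalar temporaries) and v8-v15 for @tweak, so the wrapper spills the AAPCS64 callee-saved set (x19-x28, x29/x30 and d8-d15) around the bl. Saving x15-x18 as well appears to be conservative, since those are caller-saved or platform registers under the standard aarch64 ABI.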
########################################
{ my %opcode = (
 "sm4e" => 0xcec08400,
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
index 2f5a6cf..0bde96f 100644
--- a/include/crypto/sm4_platform.h
+++ b/include/crypto/sm4_platform.h
@@ -26,6 +26,10 @@
# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
+# define HWSM4_xts_encrypt_gb sm4_v8_xts_encrypt_gb
+# define HWSM4_xts_decrypt_gb sm4_v8_xts_decrypt_gb
+# define HWSM4_xts_encrypt sm4_v8_xts_encrypt
+# define HWSM4_xts_decrypt sm4_v8_xts_decrypt
# endif
# endif
# endif /* OPENSSL_CPUID_OBJ */
@@ -46,6 +50,16 @@ void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
 size_t len, const void *key,
 const unsigned char ivec[16]);
+/* xts mode in GB/T 17964-2021 */
+void HWSM4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+void HWSM4_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+/* xts mode in IEEE Std 1619-2007 */
+void HWSM4_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+void HWSM4_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
# endif /* HWSM4_CAPABLE */

#ifdef VPSM4_EX_CAPABLE
--
2.36.1

openssl.spec
@@ -2,7 +2,7 @@
 Name: openssl
 Epoch: 1
 Version: 1.1.1m
-Release: 11
+Release: 12
 Summary: Cryptography and SSL/TLS Toolkit
 License: OpenSSL and SSLeay
 URL: https://www.openssl.org/
@@ -34,7 +34,10 @@ Patch23: CVE-2022-2068-Fix-file-operations-in-c_rehash.patch
 Patch24: CVE-2022-2097-Fix-AES-OCB-encrypt-decrypt-for-x86-AES-NI.patch
 Patch25: Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch
 Patch26: Fix-reported-performance-degradation-on-aarch64.patch
 Patch27: Feature-PKCS7-sign-and-verify-support-SM2-algorithm.patch
+Patch28: Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
+Patch29: Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
+Patch30: Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch

 BuildRequires: gcc perl make lksctp-tools-devel coreutils util-linux zlib-devel
 Requires: coreutils %{name}-libs%{?_isa} = %{epoch}:%{version}-%{release}
@@ -237,6 +240,11 @@ make test || :
 %ldconfig_scriptlets libs

 %changelog
+* Wed Nov 2 2022 Xu Yizhou <xuyizhou1@huawei.com> - 1:1.1.1m-12
+- SM3 acceleration with SM3 hardware instruction on aarch64
+- SM4 optimization for ARM by HW instruction
+- SM4 XTS optimization for ARM by HW instruction
+
 * Wed Oct 26 2022 luhuaxin <luhuaxin1@huawei.com> - 1:1.1.1m-11
 - fix cms testcase