SM3/SM4 optimization for ARM by HW instruction

Xu Yizhou 2022-11-02 11:18:01 +08:00
parent 8b90839040
commit 2882adf91a
4 changed files with 2155 additions and 2 deletions

Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch

@@ -0,0 +1,492 @@
From 4d2e328357ac4b468d4762a5a5f615d7e7bf46a6 Mon Sep 17 00:00:00 2001
From: Xu Yizhou <xuyizhou1@huawei.com>
Date: Thu, 27 Oct 2022 20:49:34 +0800
Subject: [PATCH 1/3] SM3 acceleration with SM3 hardware instruction on aarch64

This patch contains the following two PRs:

1. SM3 acceleration with SM3 hardware instruction on aarch64

SM3 hardware instructions are an optional feature of the crypto extension
for aarch64. This implementation accelerates SM3 via those instructions; on
platforms without them, the original C implementation still works. Thanks to
AliBaba for testing and reporting the following perf numbers for Yitian710:
Benchmark on T-Head Yitian-710 2.75GHz:
Before:
type          16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes  16384 bytes
sm3          49297.82k   121062.63k   223106.05k   283371.52k   307574.10k   309400.92k

After (33% - 74% faster):
type          16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes  16384 bytes
sm3          65640.01k   179121.79k   359854.59k   481448.96k   534055.59k   538274.47k
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17454)
2. Fix sm3ss1 translation issue in sm3-armv8.pl
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17542)
Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
---
Configurations/00-base-templates.conf | 1 +
Configure | 4 +
crypto/arm64cpuid.pl | 7 +
crypto/arm_arch.h | 1 +
crypto/armcap.c | 10 +
crypto/sm3/asm/sm3-armv8.pl | 280 ++++++++++++++++++++++++++
crypto/sm3/build.info | 15 +-
crypto/sm3/sm3_local.h | 16 +-
8 files changed, 332 insertions(+), 2 deletions(-)
create mode 100644 crypto/sm3/asm/sm3-armv8.pl
diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf
index 1d35012..a67ae65 100644
--- a/Configurations/00-base-templates.conf
+++ b/Configurations/00-base-templates.conf
@@ -322,6 +322,7 @@ my %targets=(
poly1305_asm_src=> "poly1305-armv8.S",
keccak1600_asm_src => "keccak1600-armv8.S",
sm4_asm_src => "vpsm4_ex-armv8.S",
+ sm3_asm_src => "sm3-armv8.S",
},
parisc11_asm => {
template => 1,
diff --git a/Configure b/Configure
index 3bfe360..fce460d 100755
--- a/Configure
+++ b/Configure
@@ -1423,6 +1423,9 @@ unless ($disabled{asm}) {
if ($target{sm4_asm_src} ne "") {
push @{$config{lib_defines}}, "VPSM4_EX_ASM";
}
+ if ($target{sm3_asm_src} ne "") {
+ push @{$config{lib_defines}}, "SM3_ASM";
+ }
}
my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC});
@@ -3379,6 +3382,7 @@ sub print_table_entry
"multilib",
"build_scheme",
"sm4_asm_src",
+ "sm3_asm_src",
);
if ($type eq "TABLE") {
diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index 319927e..1e9b167 100755
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@@ -78,6 +78,13 @@ _armv8_sha512_probe:
ret
.size _armv8_sha512_probe,.-_armv8_sha512_probe
+.globl _armv8_sm3_probe
+.type _armv8_sm3_probe,%function
+_armv8_sm3_probe:
+ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
+ ret
+.size _armv8_sm3_probe,.-_armv8_sm3_probe
+
.globl OPENSSL_cleanse
.type OPENSSL_cleanse,%function
.align 5
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 8b71055..8839b21 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -80,5 +80,6 @@ extern unsigned int OPENSSL_armcap_P;
# define ARMV8_SHA256 (1<<4)
# define ARMV8_PMULL (1<<5)
# define ARMV8_SHA512 (1<<6)
+# define ARMV8_SM3 (1<<9)
#endif
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 48c5d4d..8b2f4a5 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -47,6 +47,7 @@ void _armv8_sha1_probe(void);
void _armv8_sha256_probe(void);
void _armv8_pmull_probe(void);
# ifdef __aarch64__
+void _armv8_sm3_probe(void);
void _armv8_sha512_probe(void);
# endif
uint32_t _armv7_tick(void);
@@ -130,6 +131,7 @@ static unsigned long getauxval(unsigned long key)
# define HWCAP_CE_PMULL (1 << 4)
# define HWCAP_CE_SHA1 (1 << 5)
# define HWCAP_CE_SHA256 (1 << 6)
+# define HWCAP_CE_SM3 (1 << 18)
# define HWCAP_CE_SHA512 (1 << 21)
# endif
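On Linux the same capability bit can be read straight from the auxiliary vector instead of probing. A minimal sketch, assuming the aarch64 HWCAP layout defined above (bit 18 = SM3; cpu_has_sm3 is an illustrative helper, not part of the patch):

    #include <sys/auxv.h>

    /* HWCAP_CE_SM3 is bit 18 of AT_HWCAP on aarch64 Linux, matching the
     * #define added above */
    static int cpu_has_sm3(void)
    {
        return (getauxval(AT_HWCAP) & (1UL << 18)) != 0;
    }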
@@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void)
# ifdef __aarch64__
if (hwcap & HWCAP_CE_SHA512)
OPENSSL_armcap_P |= ARMV8_SHA512;
+
+ if (hwcap & HWCAP_CE_SM3)
+ OPENSSL_armcap_P |= ARMV8_SM3;
# endif
}
# endif
@@ -233,6 +238,11 @@ void OPENSSL_cpuid_setup(void)
_armv8_sha512_probe();
OPENSSL_armcap_P |= ARMV8_SHA512;
}
+
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sm3_probe();
+ OPENSSL_armcap_P |= ARMV8_SM3;
+ }
# endif
}
# endif
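When getauxval() is unavailable, OPENSSL_cpuid_setup() instead executes the probe instruction and catches the SIGILL that an older CPU raises, as the hunk above shows. A self-contained sketch of that pattern (probe_sm3 is a hypothetical wrapper; the real code reuses the ill_jmp buffer and signal setup already present in armcap.c):

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    static sigjmp_buf ill_jmp;
    static void ill_handler(int sig) { (void)sig; siglongjmp(ill_jmp, 1); }

    extern void _armv8_sm3_probe(void);   /* executes one sm3partw1 */

    static int probe_sm3(void)
    {
        struct sigaction sa, old;
        int ok = 0;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = ill_handler;
        sigaction(SIGILL, &sa, &old);
        if (sigsetjmp(ill_jmp, 1) == 0) {
            _armv8_sm3_probe();           /* raises SIGILL if unsupported */
            ok = 1;
        }
        sigaction(SIGILL, &old, NULL);
        return ok;
    }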
diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
new file mode 100644
index 0000000..677ca52
--- /dev/null
+++ b/crypto/sm3/asm/sm3-armv8.pl
@@ -0,0 +1,280 @@
+#! /usr/bin/env perl
+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# This module implements support for Armv8 SM3 instructions
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Message expansion:
+# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
+# Input: s0, s1, s2, s3
+# s0 = w0 | w1 | w2 | w3
+# s1 = w4 | w5 | w6 | w7
+# s2 = w8 | w9 | w10 | w11
+# s3 = w12 | w13 | w14 | w15
+# Output: s4
+sub msg_exp () {
+my $s0 = shift;
+my $s1 = shift;
+my $s2 = shift;
+my $s3 = shift;
+my $s4 = shift;
+my $vtmp1 = shift;
+my $vtmp2 = shift;
+$code.=<<___;
+ // s4 = w7 | w8 | w9 | w10
+ ext $s4.16b, $s1.16b, $s2.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext $vtmp1.16b, $s0.16b, $s1.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext $vtmp2.16b, $s2.16b, $s3.16b, #8
+ sm3partw1 $s4.4s, $s0.4s, $s3.4s
+ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s
+___
+}
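For reference, the expansion that msg_exp computes four words at a time looks like this in scalar C (a sketch following the GB/T 32905-2016 definition; rotl32, P1 and sm3_next_w are illustrative helpers, not part of the patch):

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, int n)
    {
        n &= 31;
        return n ? (x << n) | (x >> (32 - n)) : x;
    }

    static uint32_t P1(uint32_t x) { return x ^ rotl32(x, 15) ^ rotl32(x, 23); }

    /* W[j] for j >= 16, one word at a time */
    static uint32_t sm3_next_w(const uint32_t W[], int j)
    {
        return P1(W[j-16] ^ W[j-9] ^ rotl32(W[j-3], 15))
               ^ rotl32(W[j-13], 7) ^ W[j-6];
    }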
+
+# A round of the compression function
+# Input:
+# ab - chooses among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
+# vstate0 - vstate1, hold the digest state (A - H)
+# vconst0 - vconst1, used interleaved to hold Tj <<< j
+# vtmp - temporary register
+# vw - for sm3tt1ab, vw = s0 eor s1
+# s0 - for sm3tt2ab, just s0
+# i - selects wj' or wj from vw
+sub round () {
+my $ab = shift;
+my $vstate0 = shift;
+my $vstate1 = shift;
+my $vconst0 = shift;
+my $vconst1 = shift;
+my $vtmp = shift;
+my $vw = shift;
+my $s0 = shift;
+my $i = shift;
+$code.=<<___;
+ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
+ shl $vconst1.4s, $vconst0.4s, #1
+ sri $vconst1.4s, $vconst0.4s, #31
+ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i]
+ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i]
+___
+}
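Each call of this sub performs one full round of the standard compression function: sm3ss1 computes SS1, while the 'a' forms of sm3tt1/sm3tt2 implement the j < 16 boolean functions and the 'b' forms the j >= 16 ones. A scalar C sketch of the same round (illustrative helpers, not part of the patch; rotl32 as in the previous sketch):

    static uint32_t P0(uint32_t x) { return x ^ rotl32(x, 9) ^ rotl32(x, 17); }

    /* v = {A,B,C,D,E,F,G,H}; Wj and Wj1 are W[j] and W'[j] = W[j] ^ W[j+4];
     * Tj is 0x79cc4519 for j < 16 and 0x7a879d8a otherwise */
    static void sm3_round(uint32_t v[8], uint32_t Wj, uint32_t Wj1,
                          uint32_t Tj, int j)
    {
        uint32_t SS1 = rotl32(rotl32(v[0], 12) + v[4] + rotl32(Tj, j % 32), 7);
        uint32_t SS2 = SS1 ^ rotl32(v[0], 12);
        uint32_t FF  = j < 16 ? v[0] ^ v[1] ^ v[2]
                              : (v[0] & v[1]) | (v[0] & v[2]) | (v[1] & v[2]);
        uint32_t GG  = j < 16 ? v[4] ^ v[5] ^ v[6]
                              : (v[4] & v[5]) | (~v[4] & v[6]);
        uint32_t TT1 = FF + v[3] + SS2 + (Wj ^ Wj1);
        uint32_t TT2 = GG + v[7] + SS1 + Wj;

        v[3] = v[2]; v[2] = rotl32(v[1], 9); v[1] = v[0]; v[0] = TT1;
        v[7] = v[6]; v[6] = rotl32(v[5], 19); v[5] = v[4]; v[4] = P0(TT2);
    }

The shl/sri pair in the generated assembly keeps Tj <<< j current by rotating the constant register left by one each round.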
+
+sub qround () {
+my $ab = shift;
+my $vstate0 = shift;
+my $vstate1 = shift;
+my $vconst0 = shift;
+my $vconst1 = shift;
+my $vtmp1 = shift;
+my $vtmp2 = shift;
+my $s0 = shift;
+my $s1 = shift;
+my $s2 = shift;
+my $s3 = shift;
+my $s4 = shift;
+ if($s4) {
+ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
+ }
+$code.=<<___;
+ eor $vtmp1.16b, $s0.16b, $s1.16b
+___
+ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
+ $vtmp1, $s0, 0);
+ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
+ $vtmp1, $s0, 1);
+ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
+ $vtmp1, $s0, 2);
+ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
+ $vtmp1, $s0, 3);
+}
+
+$code=<<___;
+#include "arm_arch.h"
+.arch armv8.2-a
+.text
+___
+
+{{{
+my ($pstate,$pdata,$num)=("x0","x1","w2");
+my ($state1,$state2)=("v5","v6");
+my ($sconst1, $sconst2)=("s16","s17");
+my ($vconst1, $vconst2)=("v16","v17");
+my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
+my ($bkstate1,$bkstate2)=("v18","v19");
+my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
+my ($vtmp1,$vtmp2)=("v22","v23");
+my $constaddr="x8";
+# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
+$code.=<<___;
+.globl ossl_hwsm3_block_data_order
+.type ossl_hwsm3_block_data_order,%function
+.align 5
+ossl_hwsm3_block_data_order:
+ // load state
+ ld1 {$state1.4s-$state2.4s}, [$pstate]
+ rev64 $state1.4s, $state1.4s
+ rev64 $state2.4s, $state2.4s
+ ext $state1.16b, $state1.16b, $state1.16b, #8
+ ext $state2.16b, $state2.16b, $state2.16b, #8
+
+ adr $constaddr, .Tj
+ ldp $sconst1, $sconst2, [$constaddr]
+
+.Loop:
+ // load input
+ ld1 {$s0.16b-$s3.16b}, [$pdata], #64
+ sub $num, $num, #1
+
+ mov $bkstate1.16b, $state1.16b
+ mov $bkstate2.16b, $state2.16b
+
+#ifndef __ARMEB__
+ rev32 $s0.16b, $s0.16b
+ rev32 $s1.16b, $s1.16b
+ rev32 $s2.16b, $s2.16b
+ rev32 $s3.16b, $s3.16b
+#endif
+
+ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
+___
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4,$s0,$s1,$s2);
+
+$code.=<<___;
+ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
+___
+
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0,$s1,$s2,$s3);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4,$s0,$s1,$s2);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0,$s1,$s2,$s3);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1);
+
+$code.=<<___;
+ eor $state1.16b, $state1.16b, $bkstate1.16b
+ eor $state2.16b, $state2.16b, $bkstate2.16b
+
+ // any remaining blocks?
+ cbnz $num, .Loop
+
+ // save state
+ rev64 $state1.4s, $state1.4s
+ rev64 $state2.4s, $state2.4s
+ ext $state1.16b, $state1.16b, $state1.16b, #8
+ ext $state2.16b, $state2.16b, $state2.16b, #8
+ st1 {$state1.4s-$state2.4s}, [$pstate]
+ ret
+.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
+
+.align 3
+.Tj:
+.word 0x79cc4519, 0x9d8a7a87
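+// Tj round constants: 0x79cc4519 serves rounds 0-15; 0x9d8a7a87 is
+// 0x7a879d8a (the rounds 16-63 constant) pre-rotated left by 16 for
+// its first use at j = 16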
+___
+}}}
+
+#########################################
+my %sm3partopcode = (
+ "sm3partw1" => 0xce60C000,
+ "sm3partw2" => 0xce60C400);
+
+my %sm3ss1opcode = (
+ "sm3ss1" => 0xce400000);
+
+my %sm3ttopcode = (
+ "sm3tt1a" => 0xce408000,
+ "sm3tt1b" => 0xce408400,
+ "sm3tt2a" => 0xce408800,
+ "sm3tt2b" => 0xce408C00);
+
+sub unsm3part {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+}
+
+sub unsm3ss1 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+ $mnemonic,$arg;
+}
+
+sub unsm3tt {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
+ $mnemonic,$arg;
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
+ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
+ print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
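The un* helpers above assemble the SM3 instructions by hand for assemblers that do not know the mnemonics, OR-ing register numbers into a fixed base opcode (Rd, Rn << 5, Rm << 16, plus Ra << 10 for sm3ss1 and the lane index << 12 for sm3tt). A small C cross-check of that layout (a sketch; enc_sm3partw1 is illustrative):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t enc_sm3partw1(unsigned vd, unsigned vn, unsigned vm)
    {
        return 0xce60c000u | vd | (vn << 5) | (vm << 16);
    }

    int main(void)
    {
        /* matches the hand-coded probe word in crypto/arm64cpuid.pl:
         * sm3partw1 v4.4s, v0.4s, v3.4s -> 0xce63c004 */
        assert(enc_sm3partw1(4, 0, 3) == 0xce63c004u);
        return 0;
    }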
diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
index 6009b19..e113729 100644
--- a/crypto/sm3/build.info
+++ b/crypto/sm3/build.info
@@ -1,2 +1,15 @@
LIBS=../../libcrypto
-SOURCE[../../libcrypto]=sm3.c m_sm3.c
+SOURCE[../../libcrypto]=\
+ sm3.c m_sm3.c {- $target{sm3_asm_src} -}
+
+GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[sm3-armv8.o]=..
+
+BEGINRAW[Makefile]
+##### SM3 assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/sm3-%.S: {- $sourcedir -}/asm/sm3-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+
+ENDRAW[Makefile]
\ No newline at end of file
diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
index 7171de5..aafff63 100644
--- a/crypto/sm3/sm3_local.h
+++ b/crypto/sm3/sm3_local.h
@@ -32,7 +32,21 @@
ll=(c)->G; (void)HOST_l2c(ll, (s)); \
ll=(c)->H; (void)HOST_l2c(ll, (s)); \
} while (0)
-#define HASH_BLOCK_DATA_ORDER sm3_block_data_order
+
+#if defined(SM3_ASM)
+# if defined(__aarch64__)
+# include "crypto/arm_arch.h"
+# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
+void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+# endif
+#endif
+
+#if defined(HWSM3_CAPABLE)
+# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
+ : sm3_block_data_order)
+#else
+# define HASH_BLOCK_DATA_ORDER sm3_block_data_order
+#endif
void sm3_transform(SM3_CTX *c, const unsigned char *data);
--
2.36.1
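With the sm3_local.h hunk above, HASH_BLOCK_DATA_ORDER becomes a run-time choice, so a single libcrypto binary serves CPUs with and without the SM3 extension. A sketch of what the macro expands to at a call site (hypothetical caller shown for illustration; the real caller is the shared md32 hashing template):

    /* assumes the declarations from sm3_local.h are in scope */
    static void sm3_process_blocks(SM3_CTX *c, const void *p, size_t num)
    {
        (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order
                       : sm3_block_data_order)(c, p, num);
    }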

Backport-SM4-optimization-for-ARM-by-HW-instruction.patch (file diff suppressed because it is too large)

Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch

@@ -0,0 +1,621 @@
From 3f0898b2aea424f18f58a182803478f25548674e Mon Sep 17 00:00:00 2001
From: Xu Yizhou <xuyizhou1@huawei.com>
Date: Wed, 2 Nov 2022 11:13:07 +0800
Subject: [PATCH 3/3] SM4 XTS optimization for ARM by HW instruction

This patch implements SM4 XTS optimization for ARM processors using
the SM4 HW instructions, an optional feature of the crypto extension
for aarch64 (ARMv8).
Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
---
crypto/evp/e_sm4.c | 28 ++
crypto/sm4/asm/sm4-armv8.pl | 498 +++++++++++++++++++++++++++++++++-
include/crypto/sm4_platform.h | 14 +
3 files changed, 537 insertions(+), 3 deletions(-)
diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index eaa5ba0..da4dbd3 100644
--- a/crypto/evp/e_sm4.c
+++ b/crypto/evp/e_sm4.c
@@ -281,6 +281,34 @@ static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2;
xctx->stream_gb = NULL;
xctx->stream = NULL;
+#ifdef HWSM4_CAPABLE
+ if (HWSM4_CAPABLE) {
+ if (enc) {
+ HWSM4_set_encrypt_key(key, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWSM4_encrypt;
+# ifdef HWSM4_xts_encrypt_gb
+ xctx->stream_gb = HWSM4_xts_encrypt_gb;
+# endif
+# ifdef HWSM4_xts_encrypt
+ xctx->stream = HWSM4_xts_encrypt;
+# endif
+ } else {
+ HWSM4_set_decrypt_key(key, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWSM4_decrypt;
+# ifdef HWSM4_xts_decrypt_gb
+ xctx->stream_gb = HWSM4_xts_decrypt_gb;
+# endif
+# ifdef HWSM4_xts_decrypt
+ xctx->stream = HWSM4_xts_decrypt;
+# endif
+ }
+ HWSM4_set_encrypt_key(key + bytes, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f) HWSM4_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ } else
+#endif
#ifdef VPSM4_EX_CAPABLE
if (VPSM4_EX_CAPABLE) {
if (enc) {
diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
index dbacad2..923c1c0 100644
--- a/crypto/sm4/asm/sm4-armv8.pl
+++ b/crypto/sm4/asm/sm4-armv8.pl
@@ -11,9 +11,9 @@
# Oct 2021
#
-# $output is the last argument if it looks like a file (it has an extension)
+# $outut is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
-$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$outut = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
@@ -21,7 +21,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+open OUT,"| \"$^X\" $xlate $flavour \"$outut\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
@@ -110,6 +110,120 @@ $code.=<<___;
___
}
+sub mov_reg_to_vec() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $desv = shift;
+$code.=<<___;
+ mov $desv.d[0],$src0
+ mov $desv.d[1],$src1
+#ifdef __ARMEB__
+ rev32 $desv.16b,$desv.16b
+#endif
+___
+}
+
+sub mov_vec_to_reg() {
+ my $srcv = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+$code.=<<___;
+ mov $des0,$srcv.d[0]
+ mov $des1,$srcv.d[1]
+___
+}
+
+sub compute_tweak() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+ my $tmp0 = shift;
+ my $tmp1 = shift;
+ my $magic = shift;
+$code.=<<___;
+ extr x$tmp1,$src1,$src1,#32
+ extr $des1,$src1,$src0,#63
+ and w$tmp0,w$magic,w$tmp1,asr#31
+ eor $des0,x$tmp0,$src0,lsl#1
+___
+}
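compute_tweak is the usual XTS step of multiplying the 128-bit tweak by x in GF(2^128), reducing by x^128 + x^7 + x^2 + x + 1 (hence the 0x87 magic). An equivalent scalar C sketch (xts_next_tweak is illustrative; t[0] holds the low 64 bits):

    #include <stdint.h>

    static void xts_next_tweak(uint64_t t[2])
    {
        uint64_t carry = t[1] >> 63;          /* bit 127 about to shift out */
        t[1] = (t[1] << 1) | (t[0] >> 63);
        t[0] = (t[0] << 1) ^ (carry * 0x87);  /* conditional reduction */
    }

The _gb flavour wraps the same update in rbit (see the rbit helper below), since the GB/T 17964-2021 variant of XTS defines the tweak multiplication on bit-reversed blocks.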
+
+sub compute_tweak_vec() {
+ my $src = shift;
+ my $des = shift;
+ my $tmp0 = shift;
+ my $tmp1 = shift;
+ my $magic = shift;
+ &rbit($tmp1,$src);
+$code.=<<___;
+ shl $des.16b, $tmp1.16b, #1
+ ext $tmp0.16b, $tmp1.16b, $tmp1.16b,#15
+ ushr $tmp0.16b, $tmp0.16b, #7
+ mul $tmp0.16b, $tmp0.16b, $magic.16b
+ eor $des.16b, $des.16b, $tmp0.16b
+___
+ &rbit($des,$des);
+}
+
+sub mov_en_to_enc(){
+ my $en = shift;
+ my $enc = shift;
+ if ($en eq "en") {
+$code.=<<___;
+ mov $enc,1
+___
+ } else {
+$code.=<<___;
+ mov $enc,0
+___
+ }
+}
+
+sub rbit() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+ if ($standard eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ } else {
+$code.=<<___;
+ mov $dst.16b,$src.16b
+___
+ }
+ } else {
+ if ($standard eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ }
+ }
+}
+
+sub rev32_armeb() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+$code.=<<___;
+#ifdef __ARMEB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+ } else {
+$code.=<<___;
+#ifdef __ARMEB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+ }
+}
+
$code=<<___;
#include "arm_arch.h"
.arch armv8-a+crypto
@@ -595,6 +709,384 @@ $code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
+
+
+{{{
+my ($inp,$out,$len,$rk1,$rk2,$ivp)=map("x$_",(0..5));
+my ($blocks)=("x2");
+my ($enc)=("x6");
+my ($remain)=("x7");
+my @twx=map("x$_",(9..24));
+my $lastBlk=("x25");
+
+my @tweak=map("v$_",(8..15));
+my @dat=map("v$_",(16..23));
+my $lastTweak=("v24");
+
+# x/w/v/q registers for compute tweak
+my ($magic)=("8");
+my ($tmp0,$tmp1)=("26","27");
+my ($qMagic,$vMagic)=("q25","v25");
+my ($vTmp0,$vTmp1)=("v26","v27");
+
+sub gen_xts_do_cipher() {
+$code.=<<___;
+.globl ${prefix}_xts_do_cipher${standard}
+.type ${prefix}_xts_do_cipher${standard},%function
+.align 5
+${prefix}_xts_do_cipher${standard}:
+ mov w$magic,0x87
+ ldr $qMagic, =0x01010101010101010101010101010187
+ // load the round keys of key2, used to encrypt the IV to yield the initial tweak
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk2],#64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk2]
+ ld1 {@tweak[0].4s}, [$ivp]
+___
+ &rev32(@tweak[0],@tweak[0]);
+ &enc_blk(@tweak[0]);
+ &rev32(@tweak[0],@tweak[0]);
+$code.=<<___;
+ // load the round keys of key1, used to encrypt the XORed plaintext blocks
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk1],#64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk1]
+
+ and $remain,$len,#0x0F
+ // convert length into blocks
+ lsr $blocks,$len,4
+ cmp $blocks,#1 // $len must be at least 16
+ b.lt 99f
+
+ cmp $remain,0 // if $len is a multiple of 16
+ b.eq .xts_encrypt_blocks${standard}
+ // if $len is not a multiple of 16
+ subs $blocks,$blocks,#1
+ b.eq .only_2blks_tweak${standard} // if $len is less than 32
+
+.xts_encrypt_blocks${standard}:
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic);
+$code.=<<___;
+1:
+ cmp $blocks,#8
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
+ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic);
+$code.=<<___;
+ b.lt 2f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+ &rbit(@tweak[3],@tweak[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
+___
+ &rbit(@tweak[4],@tweak[4]);
+ &rbit(@tweak[5],@tweak[5]);
+ &rbit(@tweak[6],@tweak[6]);
+ &rbit(@tweak[7],@tweak[7]);
+$code.=<<___;
+ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b
+ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b
+ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b
+ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b
+ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b
+ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b
+ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b
+
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[7].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+ subs $blocks,$blocks,#8
+ b.eq 100f
+ b 1b
+2:
+ // process 4 blocks
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+ &rbit(@tweak[3],@tweak[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ sub $blocks,$blocks,#4
+ mov @tweak[0].16b,@tweak[4].16b
+ mov @tweak[1].16b,@tweak[5].16b
+ mov @tweak[2].16b,@tweak[6].16b
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[3].16b
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@dat[0].4s},[$inp],#16
+___
+ &rbit(@tweak[0],@tweak[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ st1 {@dat[0].4s},[$out],#16
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[0].16b
+ b 100f
+1: // process last 2 blocks
+ cmp $blocks,#2
+ b.gt 1f
+ ld1 {@dat[0].4s,@dat[1].4s},[$inp],#32
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ st1 {@dat[0].4s,@dat[1].4s},[$out],#32
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[1].16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$inp],#48
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$out],#48
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[2].16b
+100:
+ cmp $remain,0
+ b.eq 99f
+
+// This branch calculates the last two tweaks, used when the
+// encryption/decryption length is larger than 32
+.last_2blks_tweak${standard}:
+___
+ &rev32_armeb($lastTweak,$lastTweak);
+ &compute_tweak_vec($lastTweak,@tweak[1],$vTmp0,$vTmp1,$vMagic);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// This branch calculates the last two tweaks, used when the
+// encryption/decryption length is less than 32 and only two tweaks are needed
+.only_2blks_tweak${standard}:
+ mov @tweak[1].16b,@tweak[0].16b
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec${standard}:
+ // encryption:1 decryption:0
+ cmp $enc,1
+ b.eq .process_last_2blks${standard}
+ mov $vTmp0.16b,@tweak[1].16b
+ mov @tweak[1].16b,@tweak[2].16b
+ mov @tweak[2].16b,$vTmp0.16b
+
+.process_last_2blks${standard}:
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &rev32_armeb(@tweak[2],@tweak[2]);
+$code.=<<___;
+ ld1 {@dat[0].4s},[$inp],#16
+ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
+ st1 {@dat[0].4s},[$out],#16
+
+ sub $lastBlk,$out,16
+ .loop${standard}:
+ subs $remain,$remain,1
+ ldrb w$tmp0,[$lastBlk,$remain]
+ ldrb w$tmp1,[$inp,$remain]
+ strb w$tmp1,[$lastBlk,$remain]
+ strb w$tmp0,[$out,$remain]
+ b.gt .loop${standard}
+ ld1 {@dat[0].4s}, [$lastBlk]
+ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
+ st1 {@dat[0].4s}, [$lastBlk]
+99:
+ ret
+.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard}
+___
+} #end of gen_xts_do_cipher
+
+}}}
+
+{{{
+my ($enc)=("w6");
+
+sub gen_xts_cipher() {
+ my $en = shift;
+$code.=<<___;
+.globl ${prefix}_xts_${en}crypt${standard}
+.type ${prefix}_xts_${en}crypt${standard},%function
+.align 5
+${prefix}_xts_${en}crypt${standard}:
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+___
+ &mov_en_to_enc($en,$enc);
+$code.=<<___;
+ bl ${prefix}_xts_do_cipher${standard}
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ ret
+.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard}
+___
+
+} # end of gen_xts_cipher
+$standard="_gb";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+$standard="";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+}}}
########################################
{ my %opcode = (
"sm4e" => 0xcec08400,
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
index 2f5a6cf..0bde96f 100644
--- a/include/crypto/sm4_platform.h
+++ b/include/crypto/sm4_platform.h
@@ -26,6 +26,10 @@
# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
+# define HWSM4_xts_encrypt_gb sm4_v8_xts_encrypt_gb
+# define HWSM4_xts_decrypt_gb sm4_v8_xts_decrypt_gb
+# define HWSM4_xts_encrypt sm4_v8_xts_encrypt
+# define HWSM4_xts_decrypt sm4_v8_xts_decrypt
# endif
# endif
# endif /* OPENSSL_CPUID_OBJ */
@@ -46,6 +50,16 @@ void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
const unsigned char ivec[16]);
+/* xts mode in GB/T 17964-2021 */
+void HWSM4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+void HWSM4_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+/* xts mode in IEEE Std 1619-2007 */
+void HWSM4_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+void HWSM4_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
# endif /* HWSM4_CAPABLE */
#ifdef VPSM4_EX_CAPABLE
--
2.36.1
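Taken together, the new prototypes give callers a one-shot XTS path when the SM4 extension is present. A hedged usage sketch (sm4_xts_encrypt_sector is a hypothetical wrapper; real callers reach these through the EVP hooks set up in e_sm4.c above):

    #include <stddef.h>
    #include <stdint.h>
    #include "crypto/sm4_platform.h"

    static void sm4_xts_encrypt_sector(const unsigned char *in,
                                       unsigned char *out, size_t len,
                                       const SM4_KEY *key1, const SM4_KEY *key2,
                                       const uint8_t iv[16])
    {
    #ifdef HWSM4_CAPABLE
        if (HWSM4_CAPABLE) {
            /* IEEE 1619 flavour; HWSM4_xts_encrypt_gb for GB/T 17964-2021 */
            HWSM4_xts_encrypt(in, out, len, key1, key2, iv);
            return;
        }
    #endif
        /* otherwise fall back to the generic block128_f path (not shown) */
    }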

openssl.spec

@@ -2,7 +2,7 @@
Name: openssl
Epoch: 1
Version: 1.1.1m
Release: 11
Release: 12
Summary: Cryptography and SSL/TLS Toolkit
License: OpenSSL and SSLeay
URL: https://www.openssl.org/
@@ -34,7 +34,10 @@
Patch23: CVE-2022-2068-Fix-file-operations-in-c_rehash.patch
Patch24: CVE-2022-2097-Fix-AES-OCB-encrypt-decrypt-for-x86-AES-NI.patch
Patch25: Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch
Patch26: Fix-reported-performance-degradation-on-aarch64.patch
Patch27: Feature-PKCS7-sign-and-verify-support-SM2-algorithm.patch
Patch28: Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
Patch29: Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
Patch30: Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch
BuildRequires: gcc perl make lksctp-tools-devel coreutils util-linux zlib-devel
Requires: coreutils %{name}-libs%{?_isa} = %{epoch}:%{version}-%{release}
@@ -237,6 +240,11 @@
make test || :
%ldconfig_scriptlets libs
%changelog
* Wed Nov 2 2022 Xu Yizhou <xuyizhou1@huawei.com> - 1:1.1.1m-12
- SM3 acceleration with SM3 hardware instruction on aarch64
- SM4 optimization for ARM by HW instruction
- SM4 XTS optimization for ARM by HW instruction
* Wed Oct 26 2022 luhuaxin <luhuaxin1@huawei.com> - 1:1.1.1m-11
- fix cms testcase