!163 SM3/SM4 optimization for ARM by HW instruction
From: @xu-yi-zhou Reviewed-by: @zhujianwei001 Signed-off-by: @zhujianwei001
This commit is contained in: commit 4c696367a7

Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch (new file)
@@ -0,0 +1,492 @@
From 4d2e328357ac4b468d4762a5a5f615d7e7bf46a6 Mon Sep 17 00:00:00 2001
From: Xu Yizhou <xuyizhou1@huawei.com>
Date: Thu, 27 Oct 2022 20:49:34 +0800
Subject: [PATCH 1/3] SM3 acceleration with SM3 hardware instruction on aarch64

This patch contains the following two PRs:

1. SM3 acceleration with SM3 hardware instruction on aarch64

The SM3 hardware instruction is an optional feature of the crypto
extension for aarch64. This implementation accelerates SM3 via the SM3
instructions. On platforms that do not support the SM3 instructions, the
original C implementation still works. Thanks to AliBaba for testing and
reporting the following perf numbers for the Yitian 710:

Benchmark on T-Head Yitian-710 2.75GHz:

Before:
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k

After (33% - 74% faster):
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17454)

2. Fix sm3ss1 translation issue in sm3-armv8.pl

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17542)

Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
---
 Configurations/00-base-templates.conf |   1 +
 Configure                             |   4 +
 crypto/arm64cpuid.pl                  |   7 +
 crypto/arm_arch.h                     |   1 +
 crypto/armcap.c                       |  10 +
 crypto/sm3/asm/sm3-armv8.pl           | 280 ++++++++++++++++++++++++++
 crypto/sm3/build.info                 |  15 +-
 crypto/sm3/sm3_local.h                |  16 +-
 8 files changed, 332 insertions(+), 2 deletions(-)
 create mode 100644 crypto/sm3/asm/sm3-armv8.pl

diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf
index 1d35012..a67ae65 100644
--- a/Configurations/00-base-templates.conf
+++ b/Configurations/00-base-templates.conf
@@ -322,6 +322,7 @@ my %targets=(
poly1305_asm_src=> "poly1305-armv8.S",
keccak1600_asm_src => "keccak1600-armv8.S",
sm4_asm_src => "vpsm4_ex-armv8.S",
+ sm3_asm_src => "sm3-armv8.S",
},
parisc11_asm => {
template => 1,
diff --git a/Configure b/Configure
index 3bfe360..fce460d 100755
--- a/Configure
+++ b/Configure
@@ -1423,6 +1423,9 @@ unless ($disabled{asm}) {
if ($target{sm4_asm_src} ne "") {
push @{$config{lib_defines}}, "VPSM4_EX_ASM";
}
+ if ($target{sm3_asm_src} ne "") {
+ push @{$config{lib_defines}}, "SM3_ASM";
+ }
}

my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC});
@@ -3379,6 +3382,7 @@ sub print_table_entry
"multilib",
"build_scheme",
"sm4_asm_src",
+ "sm3_asm_src",
);

if ($type eq "TABLE") {
diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index 319927e..1e9b167 100755
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@@ -78,6 +78,13 @@ _armv8_sha512_probe:
ret
.size _armv8_sha512_probe,.-_armv8_sha512_probe

+.globl _armv8_sm3_probe
+.type _armv8_sm3_probe,%function
+_armv8_sm3_probe:
+ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
+ ret
+.size _armv8_sm3_probe,.-_armv8_sm3_probe
+
.globl OPENSSL_cleanse
.type OPENSSL_cleanse,%function
.align 5
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 8b71055..8839b21 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -80,5 +80,6 @@ extern unsigned int OPENSSL_armcap_P;
# define ARMV8_SHA256 (1<<4)
# define ARMV8_PMULL (1<<5)
# define ARMV8_SHA512 (1<<6)
+# define ARMV8_SM3 (1<<9)

#endif
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 48c5d4d..8b2f4a5 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -47,6 +47,7 @@ void _armv8_sha1_probe(void);
void _armv8_sha256_probe(void);
void _armv8_pmull_probe(void);
# ifdef __aarch64__
+void _armv8_sm3_probe(void);
void _armv8_sha512_probe(void);
# endif
uint32_t _armv7_tick(void);
@@ -130,6 +131,7 @@ static unsigned long getauxval(unsigned long key)
# define HWCAP_CE_PMULL (1 << 4)
# define HWCAP_CE_SHA1 (1 << 5)
# define HWCAP_CE_SHA256 (1 << 6)
+# define HWCAP_CE_SM3 (1 << 18)
# define HWCAP_CE_SHA512 (1 << 21)
# endif

@@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void)
# ifdef __aarch64__
if (hwcap & HWCAP_CE_SHA512)
OPENSSL_armcap_P |= ARMV8_SHA512;
+
+ if (hwcap & HWCAP_CE_SM3)
+ OPENSSL_armcap_P |= ARMV8_SM3;
# endif
}
# endif
@@ -233,6 +238,11 @@ void OPENSSL_cpuid_setup(void)
_armv8_sha512_probe();
OPENSSL_armcap_P |= ARMV8_SHA512;
}
+
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sm3_probe();
+ OPENSSL_armcap_P |= ARMV8_SM3;
+ }
# endif
}
# endif
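The armcap.c change keeps two detection paths: on Linux the HWCAP_CE_SM3 bit from getauxval(AT_HWCAP) is preferred, and otherwise the instruction itself is probed under a SIGILL handler — if the CPU lacks the extension, the probe faults and the capability flag stays clear. A minimal sketch of that probe pattern (simplified from the surrounding OPENSSL_cpuid_setup logic, not the patch's code):

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    static sigjmp_buf ill_jmp;
    static void ill_handler(int sig) { siglongjmp(ill_jmp, sig); }

    extern void _armv8_sm3_probe(void);

    /* Returns 1 if the sm3partw1 probe executes without raising SIGILL. */
    static int have_sm3(void)
    {
        int ok = 0;
        struct sigaction act, oact;

        memset(&act, 0, sizeof(act));
        act.sa_handler = ill_handler;
        sigaction(SIGILL, &act, &oact);
        if (sigsetjmp(ill_jmp, 1) == 0) {
            _armv8_sm3_probe();   /* faults here if SM3 is absent */
            ok = 1;
        }
        sigaction(SIGILL, &oact, NULL);
        return ok;
    }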
diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
new file mode 100644
index 0000000..677ca52
--- /dev/null
+++ b/crypto/sm3/asm/sm3-armv8.pl
@@ -0,0 +1,280 @@
+#! /usr/bin/env perl
+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# This module implements support for Armv8 SM3 instructions
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Message expanding:
+# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
+# Input: s0, s1, s2, s3
+# s0 = w0 | w1 | w2 | w3
+# s1 = w4 | w5 | w6 | w7
+# s2 = w8 | w9 | w10 | w11
+# s3 = w12 | w13 | w14 | w15
+# Output: s4
+sub msg_exp () {
+my $s0 = shift;
+my $s1 = shift;
+my $s2 = shift;
+my $s3 = shift;
+my $s4 = shift;
+my $vtmp1 = shift;
+my $vtmp2 = shift;
+$code.=<<___;
+ // s4 = w7 | w8 | w9 | w10
+ ext $s4.16b, $s1.16b, $s2.16b, #12
+ // vtmp1 = w3 | w4 | w5 | w6
+ ext $vtmp1.16b, $s0.16b, $s1.16b, #12
+ // vtmp2 = w10 | w11 | w12 | w13
+ ext $vtmp2.16b, $s2.16b, $s3.16b, #8
+ sm3partw1 $s4.4s, $s0.4s, $s3.4s
+ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s
+___
+}
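For reference, the expansion rule quoted in the comment above, written out scalar-wise in C (a sketch following the SM3 specification; the NEON code above computes four W words per msg_exp call instead):

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

    /* P1 permutation from the SM3 specification */
    static uint32_t P1(uint32_t x) { return x ^ rotl32(x, 15) ^ rotl32(x, 23); }

    /* Expand W[16..67] from the 16 message words W[0..15]. */
    static void sm3_expand(uint32_t W[68])
    {
        for (int j = 16; j < 68; j++)
            W[j] = P1(W[j - 16] ^ W[j - 9] ^ rotl32(W[j - 3], 15))
                   ^ rotl32(W[j - 13], 7) ^ W[j - 6];
    }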
+
+# A round of the compression function
+# Input:
+# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
+# vstate0 - vstate1, store digest status(A - H)
+# vconst0 - vconst1, interleaved used to store Tj <<< j
+# vtmp - temporary register
+# vw - for sm3tt1ab, vw = s0 eor s1
+# s0 - for sm3tt2ab, just be s0
+# i, choose wj' or wj from vw
+sub round () {
+my $ab = shift;
+my $vstate0 = shift;
+my $vstate1 = shift;
+my $vconst0 = shift;
+my $vconst1 = shift;
+my $vtmp = shift;
+my $vw = shift;
+my $s0 = shift;
+my $i = shift;
+$code.=<<___;
+ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
+ shl $vconst1.4s, $vconst0.4s, #1
+ sri $vconst1.4s, $vconst0.4s, #31
+ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i]
+ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i]
+___
+}
+
+sub qround () {
+my $ab = shift;
+my $vstate0 = shift;
+my $vstate1 = shift;
+my $vconst0 = shift;
+my $vconst1 = shift;
+my $vtmp1 = shift;
+my $vtmp2 = shift;
+my $s0 = shift;
+my $s1 = shift;
+my $s2 = shift;
+my $s3 = shift;
+my $s4 = shift;
+ if($s4) {
+ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
+ }
+$code.=<<___;
+ eor $vtmp1.16b, $s0.16b, $s1.16b
+___
+ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
+ $vtmp1, $s0, 0);
+ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
+ $vtmp1, $s0, 1);
+ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
+ $vtmp1, $s0, 2);
+ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
+ $vtmp1, $s0, 3);
+}
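Note the shl/sri pair inside round(): shifting left by one and then inserting the shifted-out top bit back at bit 31 is a rotate-left-by-1, so the destination register always ends up holding Tj <<< (j+1) for the next round. The four round() calls in qround() alternate $vconst0 and $vconst1 as source and destination, which is why the whole schedule of rotated constants can be derived from the two seed words loaded from .Tj further below, with no per-round table lookups.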
+
+$code=<<___;
+#include "arm_arch.h"
+.arch armv8.2-a
+.text
+___
+
+{{{
+my ($pstate,$pdata,$num)=("x0","x1","w2");
+my ($state1,$state2)=("v5","v6");
+my ($sconst1, $sconst2)=("s16","s17");
+my ($vconst1, $vconst2)=("v16","v17");
+my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
+my ($bkstate1,$bkstate2)=("v18","v19");
+my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
+my ($vtmp1,$vtmp2)=("v22","v23");
+my $constaddr="x8";
+# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
+$code.=<<___;
+.globl ossl_hwsm3_block_data_order
+.type ossl_hwsm3_block_data_order,%function
+.align 5
+ossl_hwsm3_block_data_order:
+ // load state
+ ld1 {$state1.4s-$state2.4s}, [$pstate]
+ rev64 $state1.4s, $state1.4s
+ rev64 $state2.4s, $state2.4s
+ ext $state1.16b, $state1.16b, $state1.16b, #8
+ ext $state2.16b, $state2.16b, $state2.16b, #8
+
+ adr $constaddr, .Tj
+ ldp $sconst1, $sconst2, [$constaddr]
+
+.Loop:
+ // load input
+ ld1 {$s0.16b-$s3.16b}, [$pdata], #64
+ sub $num, $num, #1
+
+ mov $bkstate1.16b, $state1.16b
+ mov $bkstate2.16b, $state2.16b
+
+#ifndef __ARMEB__
+ rev32 $s0.16b, $s0.16b
+ rev32 $s1.16b, $s1.16b
+ rev32 $s2.16b, $s2.16b
+ rev32 $s3.16b, $s3.16b
+#endif
+
+ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
+___
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4,$s0,$s1,$s2);
+
+$code.=<<___;
+ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
+___
+
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0,$s1,$s2,$s3);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4,$s0,$s1,$s2);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0,$s1,$s2,$s3);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1,$s2,$s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s1,$s2,$s3,$s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s2,$s3,$s4,$s0,$s1);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s3,$s4);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s4,$s0);
+ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+ $s0,$s1);
+
+$code.=<<___;
+ eor $state1.16b, $state1.16b, $bkstate1.16b
+ eor $state2.16b, $state2.16b, $bkstate2.16b
+
+ // any remaining blocks?
+ cbnz $num, .Loop
+
+ // save state
+ rev64 $state1.4s, $state1.4s
+ rev64 $state2.4s, $state2.4s
+ ext $state1.16b, $state1.16b, $state1.16b, #8
+ ext $state2.16b, $state2.16b, $state2.16b, #8
+ st1 {$state1.4s-$state2.4s}, [$pstate]
+ ret
+.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
+
+.align 3
+.Tj:
+.word 0x79cc4519, 0x9d8a7a87
+___
+}}}
+
+#########################################
+my %sm3partopcode = (
+ "sm3partw1" => 0xce60C000,
+ "sm3partw2" => 0xce60C400);
+
+my %sm3ss1opcode = (
+ "sm3ss1" => 0xce400000);
+
+my %sm3ttopcode = (
+ "sm3tt1a" => 0xce408000,
+ "sm3tt1b" => 0xce408400,
+ "sm3tt2a" => 0xce408800,
+ "sm3tt2b" => 0xce408C00);
+
+sub unsm3part {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+}
+
+sub unsm3ss1 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+ $mnemonic,$arg;
+}
+
+sub unsm3tt {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
+ $mnemonic,$arg;
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
+ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
+ print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
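These translation subs let the file assemble even when the toolchain does not recognize the SM3 mnemonics: each instruction is re-emitted as a raw .inst word with the register numbers OR-ed into a base opcode (Rd at bit 0, Rn at bit 5, Rm at bit 16, and the lane index at bit 12 for the sm3tt group). As a quick sanity check, the same arithmetic reproduces the word hard-coded in _armv8_sm3_probe earlier in this patch (a standalone sketch, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* sm3partw1 v4.4s, v0.4s, v3.4s: base | Rd | (Rn << 5) | (Rm << 16) */
        uint32_t word = 0xce60C000u | 4u | (0u << 5) | (3u << 16);
        assert(word == 0xce63c004u);  /* matches the .long in arm64cpuid.pl */
        return 0;
    }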
diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
index 6009b19..e113729 100644
--- a/crypto/sm3/build.info
+++ b/crypto/sm3/build.info
@@ -1,2 +1,15 @@
LIBS=../../libcrypto
-SOURCE[../../libcrypto]=sm3.c m_sm3.c
+SOURCE[../../libcrypto]=\
+ sm3.c m_sm3.c {- $target{sm3_asm_src} -}
+
+GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[sm3-armv8.o]=..
+
+BEGINRAW[Makefile]
+##### SM3 assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/sm3-%.S: {- $sourcedir -}/asm/sm3-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+
+ENDRAW[Makefile]
\ No newline at end of file
diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
index 7171de5..aafff63 100644
--- a/crypto/sm3/sm3_local.h
+++ b/crypto/sm3/sm3_local.h
@@ -32,7 +32,21 @@
ll=(c)->G; (void)HOST_l2c(ll, (s)); \
ll=(c)->H; (void)HOST_l2c(ll, (s)); \
} while (0)
-#define HASH_BLOCK_DATA_ORDER sm3_block_data_order
+
+#if defined(SM3_ASM)
+# if defined(__aarch64__)
+# include "crypto/arm_arch.h"
+# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
+void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+# endif
+#endif
+
+#if defined(HWSM3_CAPABLE)
+# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
+ : sm3_block_data_order)
+#else
+# define HASH_BLOCK_DATA_ORDER sm3_block_data_order
+#endif

void sm3_transform(SM3_CTX *c, const unsigned char *data);

--
2.36.1

Backport-SM4-optimization-for-ARM-by-HW-instruction.patch (new file, 1032 lines)
File diff suppressed because it is too large.

Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch (new file, 621 lines)
@@ -0,0 +1,621 @@
From 3f0898b2aea424f18f58a182803478f25548674e Mon Sep 17 00:00:00 2001
From: Xu Yizhou <xuyizhou1@huawei.com>
Date: Wed, 2 Nov 2022 11:13:07 +0800
Subject: [PATCH 3/3] SM4 XTS optimization for ARM by HW instruction

This patch implements the SM4 XTS optimization for ARM processors,
using the SM4 HW instructions, an optional feature of the crypto
extension for aarch64 (ARMv8).

Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
---
 crypto/evp/e_sm4.c            |  28 ++
 crypto/sm4/asm/sm4-armv8.pl   | 498 +++++++++++++++++++++++++++++++++-
 include/crypto/sm4_platform.h |  14 +
 3 files changed, 537 insertions(+), 3 deletions(-)

diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index eaa5ba0..da4dbd3 100644
--- a/crypto/evp/e_sm4.c
+++ b/crypto/evp/e_sm4.c
@@ -281,6 +281,34 @@ static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2;
xctx->stream_gb = NULL;
xctx->stream = NULL;
+#ifdef HWSM4_CAPABLE
+ if (HWSM4_CAPABLE) {
+ if (enc) {
+ HWSM4_set_encrypt_key(key, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWSM4_encrypt;
+# ifdef HWSM4_xts_encrypt_gb
+ xctx->stream_gb = HWSM4_xts_encrypt_gb;
+# endif
+# ifdef HWSM4_xts_encrypt
+ xctx->stream = HWSM4_xts_encrypt;
+# endif
+ } else {
+ HWSM4_set_decrypt_key(key, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWSM4_decrypt;
+# ifdef HWSM4_xts_decrypt_gb
+ xctx->stream_gb = HWSM4_xts_decrypt_gb;
+# endif
+# ifdef HWSM4_xts_decrypt
+ xctx->stream = HWSM4_xts_decrypt;
+# endif
+ }
+ HWSM4_set_encrypt_key(key + bytes, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f) HWSM4_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ } else
+#endif
#ifdef VPSM4_EX_CAPABLE
if (VPSM4_EX_CAPABLE) {
if (enc) {
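Two details worth noting in this hunk. First, ks2 always receives an encrypt schedule (HWSM4_set_encrypt_key on the second half of the key), because XTS always encrypts the tweak regardless of direction, while ks1 is scheduled for encrypt or decrypt to match the data path. Second, block1/block2 feed OpenSSL's generic block-at-a-time XTS fallback, whereas stream and stream_gb, when set, point at the fused assembly routines that process a whole data unit in one call, for the IEEE and GB tweak conventions respectively.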
diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
index dbacad2..923c1c0 100644
--- a/crypto/sm4/asm/sm4-armv8.pl
+++ b/crypto/sm4/asm/sm4-armv8.pl
@@ -11,9 +11,9 @@
# Oct 2021
#

-# $output is the last argument if it looks like a file (it has an extension)
+# $outut is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
-$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$outut = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
@@ -21,7 +21,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

-open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+open OUT,"| \"$^X\" $xlate $flavour \"$outut\""
 or die "can't call $xlate: $!";
*STDOUT=*OUT;

@@ -110,6 +110,120 @@ $code.=<<___;
___
}

+sub mov_reg_to_vec() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $desv = shift;
+$code.=<<___;
+ mov $desv.d[0],$src0
+ mov $desv.d[1],$src1
+#ifdef __ARMEB__
+ rev32 $desv.16b,$desv.16b
+#endif
+___
+}
+
+sub mov_vec_to_reg() {
+ my $srcv = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+$code.=<<___;
+ mov $des0,$srcv.d[0]
+ mov $des1,$srcv.d[1]
+___
+}
+
+sub compute_tweak() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+ my $tmp0 = shift;
+ my $tmp1 = shift;
+ my $magic = shift;
+$code.=<<___;
+ extr x$tmp1,$src1,$src1,#32
+ extr $des1,$src1,$src0,#63
+ and w$tmp0,w$magic,w$tmp1,asr#31
+ eor $des0,x$tmp0,$src0,lsl#1
+___
+}
+
+sub compute_tweak_vec() {
+ my $src = shift;
+ my $des = shift;
+ my $tmp0 = shift;
+ my $tmp1 = shift;
+ my $magic = shift;
+ &rbit($tmp1,$src);
+$code.=<<___;
+ shl $des.16b, $tmp1.16b, #1
+ ext $tmp0.16b, $tmp1.16b, $tmp1.16b,#15
+ ushr $tmp0.16b, $tmp0.16b, #7
+ mul $tmp0.16b, $tmp0.16b, $magic.16b
+ eor $des.16b, $des.16b, $tmp0.16b
+___
+ &rbit($des,$des);
+}
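compute_tweak is the scalar form of the XTS tweak update: multiply the 128-bit tweak by alpha in GF(2^128), i.e. shift left one bit and, when a bit carries out of the top, fold the reduction constant 0x87 into the low byte (the same constant loaded into w$magic and $qMagic further below). A byte-wise C sketch of the IEEE 1619 convention, for illustration only:

    #include <stdint.h>

    /* Multiply a little-endian 128-bit XTS tweak by alpha (x) in GF(2^128). */
    static void xts_mul_alpha(uint8_t t[16])
    {
        uint8_t carry = 0;

        for (int i = 0; i < 16; i++) {
            uint8_t next = t[i] >> 7;            /* bit shifted out of this byte */
            t[i] = (uint8_t)(t[i] << 1) | carry;
            carry = next;
        }
        if (carry)
            t[0] ^= 0x87;   /* reduction by x^128 + x^7 + x^2 + x + 1 */
    }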
+
+sub mov_en_to_enc(){
+ my $en = shift;
+ my $enc = shift;
+ if ($en eq "en") {
+$code.=<<___;
+ mov $enc,1
+___
+ } else {
+$code.=<<___;
+ mov $enc,0
+___
+ }
+}
+
+sub rbit() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+ if ($standard eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ } else {
+$code.=<<___;
+ mov $dst.16b,$src.16b
+___
+ }
+ } else {
+ if ($standard eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ }
+ }
+}
+
+sub rev32_armeb() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+$code.=<<___;
+#ifdef __ARMEB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+ } else {
+$code.=<<___;
+#ifdef __ARMEB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+ }
+}
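The $standard switch makes the generator emit every XTS routine twice: once with the _gb suffix for GB/T 17964-2021 and once for IEEE Std 1619-2007 (see the prototypes added to sm4_platform.h below). The difference lives entirely in the tweak arithmetic's bit order: in the GB variant, rbit reverses the bits of each byte before and after the multiply-by-alpha step, so the same shift-and-0x87 sequence implements the GB convention, while in the IEEE variant rbit degenerates to a plain mov (or to nothing when source and destination coincide).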
+
$code=<<___;
#include "arm_arch.h"
.arch armv8-a+crypto
@@ -595,6 +709,384 @@ $code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
+
+
+{{{
+my ($inp,$out,$len,$rk1,$rk2,$ivp)=map("x$_",(0..5));
+my ($blocks)=("x2");
+my ($enc)=("x6");
+my ($remain)=("x7");
+my @twx=map("x$_",(9..24));
+my $lastBlk=("x25");
+
+my @tweak=map("v$_",(8..15));
+my @dat=map("v$_",(16..23));
+my $lastTweak=("v24");
+
+# x/w/v/q registers for compute tweak
+my ($magic)=("8");
+my ($tmp0,$tmp1)=("26","27");
+my ($qMagic,$vMagic)=("q25","v25");
+my ($vTmp0,$vTmp1)=("v26","v27");
+
+sub gen_xts_do_cipher() {
+$code.=<<___;
+.globl ${prefix}_xts_do_cipher${standard}
+.type ${prefix}_xts_do_cipher${standard},%function
+.align 5
+${prefix}_xts_do_cipher${standard}:
+ mov w$magic,0x87
+ ldr $qMagic, =0x01010101010101010101010101010187
+ // used to encrypt the initial vector to yield the initial tweak
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk2],#64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk2]
+ ld1 {@tweak[0].4s}, [$ivp]
+___
+ &rev32(@tweak[0],@tweak[0]);
+ &enc_blk(@tweak[0]);
+ &rev32(@tweak[0],@tweak[0]);
+$code.=<<___;
+ // used to encrypt the XORed plaintext blocks
+ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk1],#64
+ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk1]
+
+ and $remain,$len,#0x0F
+ // convert length into blocks
+ lsr $blocks,$len,4
+ cmp $blocks,#1 // $len must be at least 16
+ b.lt 99f
+
+ cmp $remain,0 // if $len is a multiple of 16
+ b.eq .xts_encrypt_blocks${standard}
+ // if $len is not a multiple of 16
+ subs $blocks,$blocks,#1
+ b.eq .only_2blks_tweak${standard} // if $len is less than 32
+
+.xts_encrypt_blocks${standard}:
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic);
+$code.=<<___;
+1:
+ cmp $blocks,#8
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
+ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic);
+ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic);
+$code.=<<___;
+ b.lt 2f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+ &rbit(@tweak[3],@tweak[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
+___
+ &rbit(@tweak[4],@tweak[4]);
+ &rbit(@tweak[5],@tweak[5]);
+ &rbit(@tweak[6],@tweak[6]);
+ &rbit(@tweak[7],@tweak[7]);
+$code.=<<___;
+ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b
+ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b
+ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b
+ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &rev32(@dat[4],@dat[4]);
+ &rev32(@dat[5],@dat[5]);
+ &rev32(@dat[6],@dat[6]);
+ &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b
+ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b
+ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b
+ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b
+
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[7].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+ subs $blocks,$blocks,#8
+ b.eq 100f
+ b 1b
+2:
+ // process 4 blocks
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+ &rbit(@tweak[3],@tweak[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+ sub $blocks,$blocks,#4
+ mov @tweak[0].16b,@tweak[4].16b
+ mov @tweak[1].16b,@tweak[5].16b
+ mov @tweak[2].16b,@tweak[6].16b
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[3].16b
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@dat[0].4s},[$inp],#16
+___
+ &rbit(@tweak[0],@tweak[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ st1 {@dat[0].4s},[$out],#16
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[0].16b
+ b 100f
+1: // process last 2 blocks
+ cmp $blocks,#2
+ b.gt 1f
+ ld1 {@dat[0].4s,@dat[1].4s},[$inp],#32
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ st1 {@dat[0].4s,@dat[1].4s},[$out],#32
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[1].16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$inp],#48
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$out],#48
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[2].16b
+100:
+ cmp $remain,0
+ b.eq 99f
+
+// This branch calculates the last two tweaks,
+// when the encryption/decryption length is larger than 32
+.last_2blks_tweak${standard}:
+___
+ &rev32_armeb($lastTweak,$lastTweak);
+ &compute_tweak_vec($lastTweak,@tweak[1],$vTmp0,$vTmp1,$vMagic);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// This branch calculates the last two tweaks,
+// when the encryption/decryption length is less than 32 and only two tweaks are needed
+.only_2blks_tweak${standard}:
+ mov @tweak[1].16b,@tweak[0].16b
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec${standard}:
+ // encryption:1 decryption:0
+ cmp $enc,1
+ b.eq .prcess_last_2blks${standard}
+ mov $vTmp0.16B,@tweak[1].16b
+ mov @tweak[1].16B,@tweak[2].16b
+ mov @tweak[2].16B,$vTmp0.16b
+
+.prcess_last_2blks${standard}:
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &rev32_armeb(@tweak[2],@tweak[2]);
+$code.=<<___;
+ ld1 {@dat[0].4s},[$inp],#16
+ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
+ st1 {@dat[0].4s},[$out],#16
+
+ sub $lastBlk,$out,16
+ .loop${standard}:
+ subs $remain,$remain,1
+ ldrb w$tmp0,[$lastBlk,$remain]
+ ldrb w$tmp1,[$inp,$remain]
+ strb w$tmp1,[$lastBlk,$remain]
+ strb w$tmp0,[$out,$remain]
+ b.gt .loop${standard}
+ ld1 {@dat[0].4s}, [$lastBlk]
+ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
+ st1 {@dat[0].4s}, [$lastBlk]
+99:
+ ret
+.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard}
+___
+} #end of gen_xts_do_cipher
+
+}}}
+
+{{{
+my ($enc)=("w6");
+
+sub gen_xts_cipher() {
+ my $en = shift;
+$code.=<<___;
+.globl ${prefix}_xts_${en}crypt${standard}
+.type ${prefix}_xts_${en}crypt${standard},%function
+.align 5
+${prefix}_xts_${en}crypt${standard}:
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+___
+ &mov_en_to_enc($en,$enc);
+$code.=<<___;
+ bl ${prefix}_xts_do_cipher${standard}
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ ret
+.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard}
+___
+
+} # end of gen_xts_cipher
+$standard="_gb";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+$standard="";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+}}}
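A note on the wrappers above: the shared do_cipher body uses x9-x27 for the tweak pipeline (@twx, $lastBlk, the scalar temporaries) and v8-v15 for @tweak, so the wrapper spills the AAPCS64 callee-saved set (x19-x28, x29/x30 and d8-d15) around the bl. Saving x15-x18 as well appears to be conservative, since those are caller-saved or platform registers under the standard aarch64 ABI.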
########################################
{ my %opcode = (
 "sm4e" => 0xcec08400,
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
index 2f5a6cf..0bde96f 100644
--- a/include/crypto/sm4_platform.h
+++ b/include/crypto/sm4_platform.h
@@ -26,6 +26,10 @@
# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
+# define HWSM4_xts_encrypt_gb sm4_v8_xts_encrypt_gb
+# define HWSM4_xts_decrypt_gb sm4_v8_xts_decrypt_gb
+# define HWSM4_xts_encrypt sm4_v8_xts_encrypt
+# define HWSM4_xts_decrypt sm4_v8_xts_decrypt
# endif
# endif
# endif /* OPENSSL_CPUID_OBJ */
@@ -46,6 +50,16 @@ void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
 size_t len, const void *key,
 const unsigned char ivec[16]);
+/* xts mode in GB/T 17964-2021 */
+void HWSM4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+void HWSM4_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+/* xts mode in IEEE Std 1619-2007 */
+void HWSM4_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+void HWSM4_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
# endif /* HWSM4_CAPABLE */

#ifdef VPSM4_EX_CAPABLE
--
2.36.1

openssl.spec
@@ -2,7 +2,7 @@
 Name: openssl
 Epoch: 1
 Version: 1.1.1m
-Release: 11
+Release: 12
 Summary: Cryptography and SSL/TLS Toolkit
 License: OpenSSL and SSLeay
 URL: https://www.openssl.org/
@@ -34,7 +34,10 @@ Patch23: CVE-2022-2068-Fix-file-operations-in-c_rehash.patch
 Patch24: CVE-2022-2097-Fix-AES-OCB-encrypt-decrypt-for-x86-AES-NI.patch
 Patch25: Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch
 Patch26: Fix-reported-performance-degradation-on-aarch64.patch
 Patch27: Feature-PKCS7-sign-and-verify-support-SM2-algorithm.patch
+Patch28: Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
+Patch29: Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
+Patch30: Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch

 BuildRequires: gcc perl make lksctp-tools-devel coreutils util-linux zlib-devel
 Requires: coreutils %{name}-libs%{?_isa} = %{epoch}:%{version}-%{release}
@@ -237,6 +240,11 @@ make test || :
 %ldconfig_scriptlets libs

 %changelog
+* Wed Nov 2 2022 Xu Yizhou <xuyizhou1@huawei.com> - 1:1.1.1m-12
+- SM3 acceleration with SM3 hardware instruction on aarch64
+- SM4 optimization for ARM by HW instruction
+- SM4 XTS optimization for ARM by HW instruction
+
 * Wed Oct 26 2022 luhuaxin <luhuaxin1@huawei.com> - 1:1.1.1m-11
 - fix cms testcase