diff --git a/0009-uadk-sec-move-function-to-wd_digest_drv.h.patch b/0009-uadk-sec-move-function-to-wd_digest_drv.h.patch new file mode 100644 index 0000000..783ea8d --- /dev/null +++ b/0009-uadk-sec-move-function-to-wd_digest_drv.h.patch @@ -0,0 +1,201 @@ +From b1eeb7ddb8305466cdfb4e49cc68b0b4264d4a43 Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Mon, 5 Feb 2024 17:24:21 +0800 +Subject: [PATCH 09/44] uadk/sec: move function to wd_digest_drv.h + +Since function get_hash_bd_type() will be used in multiple files, +move it to wd_digest_drv.h. And rename get_hash_bd_type to +get_hash_block_type to make the function generic. + +Signed-off-by: Weili Qian +--- + drv/hisi_sec.c | 52 ++++++++++--------------------------- + include/drv/wd_digest_drv.h | 27 ++++++++++++++++++- + 2 files changed, 39 insertions(+), 40 deletions(-) + +diff --git a/drv/hisi_sec.c b/drv/hisi_sec.c +index 5b114f6..9da21a8 100644 +--- a/drv/hisi_sec.c ++++ b/drv/hisi_sec.c +@@ -200,13 +200,6 @@ enum sec_c_width { + C_WIDTH_CS3 = 0x3, + }; + +-enum hash_bd_type { +- HASH_SINGLE_BD, +- HASH_FRIST_BD, +- HASH_MIDDLE_BD, +- HASH_END_BD, +-}; +- + struct hisi_sec_ctx { + struct wd_ctx_config_internal config; + }; +@@ -1549,29 +1542,10 @@ static int long_hash_param_check(handle_t h_qp, struct wd_digest_msg *msg) + return 0; + } + +-static enum hash_bd_type get_hash_bd_type(struct wd_digest_msg *msg) +-{ +- /* +- * [has_next , iv_bytes] +- * [ 1 , 0 ] = long hash(frist bd) +- * [ 1 , 1 ] = long hash(middle bd) +- * [ 0 , 1 ] = long hash(end bd) +- * [ 0 , 0 ] = block hash(single bd) +- */ +- if (msg->has_next && !msg->iv_bytes) +- return HASH_FRIST_BD; +- else if (msg->has_next && msg->iv_bytes) +- return HASH_MIDDLE_BD; +- else if (!msg->has_next && msg->iv_bytes) +- return HASH_END_BD; +- else +- return HASH_SINGLE_BD; +-} +- + static int fill_digest_long_hash(handle_t h_qp, struct wd_digest_msg *msg, + struct hisi_sec_sqe *sqe) + { +- enum hash_bd_type bd_type = get_hash_bd_type(msg); ++ enum
hash_block_type block_type = get_hash_block_type(msg); + __u64 total_bits; + int ret; + +@@ -1579,20 +1553,20 @@ static int fill_digest_long_hash(handle_t h_qp, struct wd_digest_msg *msg, + if (ret) + return ret; + +- if (bd_type == HASH_FRIST_BD) { ++ if (block_type == HASH_FRIST_BLOCK) { + /* Long hash first */ + sqe->ai_apd_cs = AI_GEN_INNER; + sqe->ai_apd_cs |= AUTHPAD_NOPAD << AUTHPAD_OFFSET; + } + +- if (bd_type == HASH_MIDDLE_BD) { ++ if (block_type == HASH_MIDDLE_BLOCK) { + /* Long hash middle */ + sqe->ai_apd_cs = AI_GEN_IVIN_ADDR; + sqe->ai_apd_cs |= AUTHPAD_NOPAD << AUTHPAD_OFFSET; + sqe->type2.a_ivin_addr = sqe->type2.mac_addr; + } + +- if (bd_type == HASH_END_BD) { ++ if (block_type == HASH_END_BLOCK) { + /* Long hash end */ + sqe->ai_apd_cs = AI_GEN_IVIN_ADDR; + sqe->ai_apd_cs |= AUTHPAD_PAD << AUTHPAD_OFFSET; +@@ -1658,16 +1632,16 @@ static int digest_long_bd_align_check(struct wd_digest_msg *msg) + + static int digest_bd2_type_check(struct wd_digest_msg *msg) + { +- enum hash_bd_type type = get_hash_bd_type(msg); ++ enum hash_block_type type = get_hash_block_type(msg); + + /* Long hash first and middle bd */ +- if (type == HASH_FRIST_BD || type == HASH_MIDDLE_BD) { ++ if (type == HASH_FRIST_BLOCK || type == HASH_MIDDLE_BLOCK) { + WD_ERR("hardware v2 not supports 0 size in long hash!\n"); + return -WD_EINVAL; + } + + /* Block mode hash bd */ +- if (type == HASH_SINGLE_BD) { ++ if (type == HASH_SINGLE_BLOCK) { + WD_ERR("hardware v2 not supports 0 size in block hash!\n"); + return -WD_EINVAL; + } +@@ -1677,9 +1651,9 @@ static int digest_bd2_type_check(struct wd_digest_msg *msg) + + static int digest_bd3_type_check(struct wd_digest_msg *msg) + { +- enum hash_bd_type type = get_hash_bd_type(msg); ++ enum hash_block_type type = get_hash_block_type(msg); + /* Long hash first and middle bd */ +- if (type == HASH_FRIST_BD || type == HASH_MIDDLE_BD) { ++ if (type == HASH_FRIST_BLOCK || type == HASH_MIDDLE_BLOCK) { + WD_ERR("invalid: hardware v3 not supports 0 
size in long hash!\n"); + return -WD_EINVAL; + } +@@ -1920,7 +1894,7 @@ static int aes_auth_long_hash_check(struct wd_digest_msg *msg) + static int fill_digest_long_hash3(handle_t h_qp, struct wd_digest_msg *msg, + struct hisi_sec_sqe3 *sqe) + { +- enum hash_bd_type bd_type = get_hash_bd_type(msg); ++ enum hash_block_type block_type = get_hash_block_type(msg); + __u64 total_bits; + int ret; + +@@ -1932,20 +1906,20 @@ static int fill_digest_long_hash3(handle_t h_qp, struct wd_digest_msg *msg, + if (ret) + return ret; + +- if (bd_type == HASH_FRIST_BD) { ++ if (block_type == HASH_FRIST_BLOCK) { + /* Long hash first */ + sqe->auth_mac_key |= AI_GEN_INNER << SEC_AI_GEN_OFFSET_V3; + sqe->stream_scene.stream_auth_pad = AUTHPAD_NOPAD; + } + +- if (bd_type == HASH_MIDDLE_BD) { ++ if (block_type == HASH_MIDDLE_BLOCK) { + /* Long hash middle */ + sqe->auth_mac_key |= AI_GEN_IVIN_ADDR << SEC_AI_GEN_OFFSET_V3; + sqe->stream_scene.stream_auth_pad = AUTHPAD_NOPAD; + sqe->auth_ivin.a_ivin_addr = sqe->mac_addr; + } + +- if (bd_type == HASH_END_BD) { ++ if (block_type == HASH_END_BLOCK) { + /* Long hash end */ + sqe->auth_mac_key |= AI_GEN_IVIN_ADDR << SEC_AI_GEN_OFFSET_V3; + sqe->stream_scene.stream_auth_pad = AUTHPAD_PAD; +diff --git a/include/drv/wd_digest_drv.h b/include/drv/wd_digest_drv.h +index 3c4477d..304b506 100644 +--- a/include/drv/wd_digest_drv.h ++++ b/include/drv/wd_digest_drv.h +@@ -10,7 +10,13 @@ + extern "C" { + #endif + +-/* fixme wd_digest_msg */ ++enum hash_block_type { ++ HASH_FRIST_BLOCK, ++ HASH_MIDDLE_BLOCK, ++ HASH_END_BLOCK, ++ HASH_SINGLE_BLOCK, ++}; ++ + struct wd_digest_msg { + struct wd_digest_req req; + /* request identifier */ +@@ -51,6 +57,25 @@ struct wd_digest_msg { + __u64 long_data_len; + }; + ++static inline enum hash_block_type get_hash_block_type(struct wd_digest_msg *msg) ++{ ++ /* ++ * [has_next , iv_bytes] ++ * [ 1 , 0 ] = long hash(frist bd) ++ * [ 1 , 1 ] = long hash(middle bd) ++ * [ 0 , 1 ] = long hash(end bd) ++ * [ 0 , 0 ] = block 
hash(single bd) ++ */ ++ if (msg->has_next && !msg->iv_bytes) ++ return HASH_FRIST_BLOCK; ++ else if (msg->has_next && msg->iv_bytes) ++ return HASH_MIDDLE_BLOCK; ++ else if (!msg->has_next && msg->iv_bytes) ++ return HASH_END_BLOCK; ++ else ++ return HASH_SINGLE_BLOCK; ++} ++ + struct wd_digest_msg *wd_digest_get_msg(__u32 idx, __u32 tag); + + #ifdef __cplusplus +-- +2.25.1 + diff --git a/0010-uadk-digest-add-partial_block-to-store-partial-data.patch b/0010-uadk-digest-add-partial_block-to-store-partial-data.patch new file mode 100644 index 0000000..70e5a46 --- /dev/null +++ b/0010-uadk-digest-add-partial_block-to-store-partial-data.patch @@ -0,0 +1,149 @@ +From 6ad149cab59176faf05e65233b4986916a1f7c8d Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Mon, 5 Feb 2024 17:27:07 +0800 +Subject: [PATCH 10/44] uadk/digest: add partial_block to store partial data + +For the long hash first block and middle block, if the size of +the data is not aligned with the block size, the partial data is +stored in partial_block and combined with the next block to form +an aligned length for calculation. Currently, partial_block is +added to struct wd_digest_sess to store partial data.
+ +Signed-off-by: Weili Qian +--- + include/drv/wd_digest_drv.h | 4 +++ + wd_digest.c | 50 ++++++++++++++++++++++++------------- + 2 files changed, 36 insertions(+), 18 deletions(-) + +diff --git a/include/drv/wd_digest_drv.h b/include/drv/wd_digest_drv.h +index 304b506..8a4aa0b 100644 +--- a/include/drv/wd_digest_drv.h ++++ b/include/drv/wd_digest_drv.h +@@ -44,6 +44,8 @@ struct wd_digest_msg { + __u32 in_bytes; + /* out_bytes */ + __u32 out_bytes; ++ /* partial bytes for stream mode */ ++ __u32 partial_bytes; + + /* input key pointer */ + __u8 *key; +@@ -53,6 +55,8 @@ struct wd_digest_msg { + __u8 *in; + /* output data pointer */ + __u8 *out; ++ /* partial pointer for stream mode */ ++ __u8 *partial_block; + /* total of data for stream mode */ + __u64 long_data_len; + }; +diff --git a/wd_digest.c b/wd_digest.c +index acf341a..dba2f95 100644 +--- a/wd_digest.c ++++ b/wd_digest.c +@@ -11,6 +11,7 @@ + #include "wd_digest.h" + + #define GMAC_IV_LEN 16 ++#define MAX_BLOCK_SIZE 128 + + static __u32 g_digest_mac_len[WD_DIGEST_TYPE_MAX] = { + WD_DIGEST_SM3_LEN, WD_DIGEST_MD5_LEN, WD_DIGEST_SHA1_LEN, +@@ -45,6 +46,19 @@ struct wd_digest_setting { + void *dlh_list; + } wd_digest_setting; + ++struct wd_digest_stream_data { ++ /* Long hash mode, first and middle block misaligned data */ ++ unsigned char partial_block[MAX_BLOCK_SIZE]; ++ __u32 partial_bytes; ++ /* Total data length for stream mode */ ++ __u64 long_data_len; ++ /* ++ * Notify the stream message state, zero is frist message, ++ * non-zero is middle or final message. ++ */ ++ int msg_state; ++}; ++ + struct wd_digest_sess { + char *alg_name; + enum wd_digest_type alg; +@@ -53,14 +67,7 @@ struct wd_digest_sess { + unsigned char key[MAX_HMAC_KEY_SIZE]; + __u32 key_bytes; + void *sched_key; +- /* +- * Notify the stream message state, zero is frist message, +- * non-zero is middle or final message. 
+- */ +- int msg_state; +- +- /* Total data length for stream mode */ +- __u64 long_data_len; ++ struct wd_digest_stream_data stream_data; + }; + + struct wd_env_config wd_digest_env_config; +@@ -536,12 +543,12 @@ static void fill_request_msg(struct wd_digest_msg *msg, + memcpy(&msg->req, req, sizeof(struct wd_digest_req)); + + if (unlikely(req->has_next == WD_DIGEST_STREAM_END)) { +- sess->long_data_len = req->long_data_len; +- sess->msg_state = WD_DIGEST_DOING; ++ sess->stream_data.long_data_len = req->long_data_len; ++ sess->stream_data.msg_state = WD_DIGEST_DOING; + req->has_next = WD_DIGEST_END; + } else if (unlikely(req->has_next == WD_DIGEST_STREAM_DOING)) { +- sess->long_data_len = req->long_data_len; +- sess->msg_state = WD_DIGEST_DOING; ++ sess->stream_data.long_data_len = req->long_data_len; ++ sess->stream_data.msg_state = WD_DIGEST_DOING; + req->has_next = WD_DIGEST_DOING; + } + +@@ -557,10 +564,12 @@ static void fill_request_msg(struct wd_digest_msg *msg, + msg->out_bytes = req->out_bytes; + msg->data_fmt = req->data_fmt; + msg->has_next = req->has_next; +- msg->long_data_len = sess->long_data_len + req->in_bytes; ++ msg->long_data_len = sess->stream_data.long_data_len + req->in_bytes; ++ msg->partial_block = sess->stream_data.partial_block; ++ msg->partial_bytes = sess->stream_data.partial_bytes; + + /* Use iv_bytes to store the stream message state */ +- msg->iv_bytes = sess->msg_state; ++ msg->iv_bytes = sess->stream_data.msg_state; + } + + static int send_recv_sync(struct wd_ctx_internal *ctx, struct wd_digest_sess *dsess, +@@ -579,17 +588,22 @@ static int send_recv_sync(struct wd_ctx_internal *ctx, struct wd_digest_sess *ds + if (unlikely(ret)) + return ret; + +- /* After a stream mode job was done, update session long_data_len */ ++ /* ++ * After a stream mode job was done, update session ++ * long_data_len and partial_bytes. 
++ */ + if (msg->has_next) { + /* Long hash(first and middle message) */ +- dsess->long_data_len += msg->in_bytes; ++ dsess->stream_data.long_data_len += msg->in_bytes; ++ dsess->stream_data.partial_bytes = msg->partial_bytes; + } else if (msg->iv_bytes) { + /* Long hash(final message) */ +- dsess->long_data_len = 0; ++ dsess->stream_data.long_data_len = 0; ++ dsess->stream_data.partial_bytes = 0; + } + + /* Update session message state */ +- dsess->msg_state = msg->has_next; ++ dsess->stream_data.msg_state = msg->has_next; + + return 0; + } +-- +2.25.1 + diff --git a/0011-uadk-digest-add-wd_ctx_spin_lock-function.patch b/0011-uadk-digest-add-wd_ctx_spin_lock-function.patch new file mode 100644 index 0000000..6af2e7e --- /dev/null +++ b/0011-uadk-digest-add-wd_ctx_spin_lock-function.patch @@ -0,0 +1,77 @@ +From b06161de909136e59ecd7f148ef7e8ba72652e34 Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Mon, 5 Feb 2024 17:27:17 +0800 +Subject: [PATCH 11/44] uadk/digest: add wd_ctx_spin_lock function + +In synchronous mode, to protect hardware queue resources and +prevent multiple threads from sending packets to the same queue +at the same time, lock is added before packets are sent in function +send_recv_sync(). + +In non-hard computing scenarios, the resources are independent, +and multiple synchronization threads can process at the same time. +If lock is added before packets are sent, the multi-thread performance +deteriorates. Therefore, the wd_ctx_spin_lock and wd_ctx_spin_unlock +interfaces are added. In non-hard computing scenarios, the lock +is not added. 
+ +Signed-off-by: Weili Qian +--- + include/wd_util.h | 23 +++++++++++++++++++++++ + wd_digest.c | 4 ++-- + 2 files changed, 25 insertions(+), 2 deletions(-) + +diff --git a/include/wd_util.h b/include/wd_util.h +index 3059ac1..f217f0f 100644 +--- a/include/wd_util.h ++++ b/include/wd_util.h +@@ -527,6 +527,29 @@ static inline void wd_dfx_msg_cnt(struct wd_ctx_config_internal *config, + config->msg_cnt[sqn]++; + } + ++/** ++ * wd_ctx_spin_lock() - Lock interface, which is used in the synchronization process. ++ * @ctx: queue context. ++ * @type: the type of the driver. ++ * ++ * If the drvier type is not UADK_ALG_HW, the lock is not required. ++ */ ++static inline void wd_ctx_spin_lock(struct wd_ctx_internal *ctx, int type) ++{ ++ if (type != UADK_ALG_HW) ++ return; ++ ++ pthread_spin_lock(&ctx->lock); ++} ++ ++static inline void wd_ctx_spin_unlock(struct wd_ctx_internal *ctx, int type) ++{ ++ if (type != UADK_ALG_HW) ++ return; ++ ++ pthread_spin_unlock(&ctx->lock); ++} ++ + #ifdef __cplusplus + } + #endif +diff --git a/wd_digest.c b/wd_digest.c +index dba2f95..c59184d 100644 +--- a/wd_digest.c ++++ b/wd_digest.c +@@ -581,10 +581,10 @@ static int send_recv_sync(struct wd_ctx_internal *ctx, struct wd_digest_sess *ds + msg_handle.send = wd_digest_setting.driver->send; + msg_handle.recv = wd_digest_setting.driver->recv; + +- pthread_spin_lock(&ctx->lock); ++ wd_ctx_spin_lock(ctx, wd_digest_setting.driver->calc_type); + ret = wd_handle_msg_sync(wd_digest_setting.driver, &msg_handle, ctx->ctx, + msg, NULL, wd_digest_setting.config.epoll_en); +- pthread_spin_unlock(&ctx->lock); ++ wd_ctx_spin_unlock(ctx, wd_digest_setting.driver->calc_type); + if (unlikely(ret)) + return ret; + +-- +2.25.1 + diff --git a/0012-uadk-remove-redundant-header-file-in-makefile.patch b/0012-uadk-remove-redundant-header-file-in-makefile.patch new file mode 100644 index 0000000..e07651b --- /dev/null +++ b/0012-uadk-remove-redundant-header-file-in-makefile.patch @@ -0,0 +1,28 @@ +From 
415b2d379fd74e1e115c9f15b86e976f5c5addb7 Mon Sep 17 00:00:00 2001 +From: Zhiqi Song +Date: Tue, 12 Mar 2024 11:38:46 +0800 +Subject: [PATCH 12/44] uadk: remove redundant header file in makefile + +Remove wrong 'wd_hpre_drv.h' of hpre. + +Signed-off-by: Zhiqi Song +--- + Makefile.am | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/Makefile.am b/Makefile.am +index 64cfa44..25853eb 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -86,7 +86,7 @@ libhisi_sec_la_SOURCES=drv/hisi_sec.c drv/hisi_qm_udrv.c \ + hisi_qm_udrv.h wd_cipher_drv.h wd_aead_drv.h aes.h galois.h + + libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \ +- hisi_qm_udrv.h wd_hpre_drv.h ++ hisi_qm_udrv.h + if WD_STATIC_DRV + AM_CFLAGS += -DWD_STATIC_DRV -fPIC + AM_CFLAGS += -DWD_NO_LOG +-- +2.25.1 + diff --git a/0013-uadk-isa-ce-support-sm3-ce-instruction.patch b/0013-uadk-isa-ce-support-sm3-ce-instruction.patch new file mode 100644 index 0000000..4705b71 --- /dev/null +++ b/0013-uadk-isa-ce-support-sm3-ce-instruction.patch @@ -0,0 +1,1888 @@ +From da5f058d30f6d7eb28b4afbe27633d7664ba0961 Mon Sep 17 00:00:00 2001 +From: Zhiqi Song +Date: Mon, 11 Mar 2024 18:07:22 +0800 +Subject: [PATCH 13/44] uadk/isa-ce: support sm3 ce instruction + +Support sync sm3 ce instruction, users can use ce +instruction to accelerate sm3 sync task through init2 +related functions. + +This patch also includes: +1. Add compile parameter and related file to support +isa-ce library. +2. Check whether the platform supports the CE instruction +in alg driver register process. +3. Make HW driver and INSTR driver of the same alg can +be requested at the same time. +4. Support sm3 ce block mode and stream mode for sm3-normal +and hmac-sm3.
+ +Signed-off-by: Zhiqi Song +--- + Makefile.am | 15 +- + configure.ac | 3 + + drv/isa_ce_sm3.c | 401 ++++++++++++++++++++ + drv/isa_ce_sm3.h | 86 +++++ + drv/isa_ce_sm3_armv8.S | 765 ++++++++++++++++++++++++++++++++++++++ + include/drv/arm_arch_ce.h | 199 ++++++++++ + include/wd_alg.h | 43 +++ + wd_alg.c | 32 +- + wd_digest.c | 2 +- + wd_sched.c | 2 +- + wd_util.c | 87 ++++- + 11 files changed, 1616 insertions(+), 19 deletions(-) + create mode 100644 drv/isa_ce_sm3.c + create mode 100644 drv/isa_ce_sm3.h + create mode 100644 drv/isa_ce_sm3_armv8.S + create mode 100644 include/drv/arm_arch_ce.h + +diff --git a/Makefile.am b/Makefile.am +index 25853eb..19eab30 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -43,7 +43,8 @@ nobase_pkginclude_HEADERS = v1/wd.h v1/wd_cipher.h v1/wd_aead.h v1/uacce.h v1/wd + lib_LTLIBRARIES=libwd.la libwd_comp.la libwd_crypto.la + + uadk_driversdir=$(libdir)/uadk +-uadk_drivers_LTLIBRARIES=libhisi_sec.la libhisi_hpre.la libhisi_zip.la ++uadk_drivers_LTLIBRARIES=libhisi_sec.la libhisi_hpre.la libhisi_zip.la \ ++ libisa_ce.la + + libwd_la_SOURCES=wd.c wd_mempool.c wd.h wd_alg.c wd_alg.h \ + v1/wd.c v1/wd.h v1/wd_adapter.c v1/wd_adapter.h \ +@@ -87,6 +88,10 @@ libhisi_sec_la_SOURCES=drv/hisi_sec.c drv/hisi_qm_udrv.c \ + + libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \ + hisi_qm_udrv.h ++ ++libisa_ce_la_SOURCES=drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S arm_arch_ce.h \ ++ drv/isa_ce_sm3.h ++ + if WD_STATIC_DRV + AM_CFLAGS += -DWD_STATIC_DRV -fPIC + AM_CFLAGS += -DWD_NO_LOG +@@ -106,6 +111,10 @@ libhisi_sec_la_DEPENDENCIES = libwd.la libwd_crypto.la + + libhisi_hpre_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS) + libhisi_hpre_la_DEPENDENCIES = libwd.la libwd_crypto.la ++ ++libisa_ce_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS) ++libisa_ce_la_DEPENDENCIES = libwd.la libwd_crypto.la ++ + else + UADK_WD_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd.map + UADK_CRYPTO_SYMBOL= 
-Wl,--version-script,$(top_srcdir)/libwd_crypto.map +@@ -134,6 +143,10 @@ libhisi_sec_la_DEPENDENCIES= libwd.la libwd_crypto.la + libhisi_hpre_la_LIBADD= -lwd -lwd_crypto + libhisi_hpre_la_LDFLAGS=$(UADK_VERSION) + libhisi_hpre_la_DEPENDENCIES= libwd.la libwd_crypto.la ++ ++libisa_ce_la_LIBADD= -lwd -lwd_crypto ++libisa_ce_la_LDFLAGS=$(UADK_VERSION) ++libisa_ce_la_DEPENDENCIES= libwd.la libwd_crypto.la + endif # WD_STATIC_DRV + + pkgconfigdir = $(libdir)/pkgconfig +diff --git a/configure.ac b/configure.ac +index b198417..4ed111e 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -21,6 +21,9 @@ LT_INIT + AC_SUBST([hardcode_into_libs], [no]) + AM_PROG_CC_C_O + ++# Support assembler ++AM_PROG_AS ++ + AC_ARG_ENABLE([debug-log], + AS_HELP_STRING([--enable-debug-log], [enable debug logging globally]), + [ AS_IF([test "x$enable_debug_log" = "xyes"], +diff --git a/drv/isa_ce_sm3.c b/drv/isa_ce_sm3.c +new file mode 100644 +index 0000000..f16bdd3 +--- /dev/null ++++ b/drv/isa_ce_sm3.c +@@ -0,0 +1,401 @@ ++// SPDX-License-Identifier: Apache-2.0 ++/* ++ * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++/* ++ * Copyright 2023 Huawei Technologies Co.,Ltd. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include "drv/isa_ce_sm3.h" ++#include "drv/wd_digest_drv.h" ++#include "wd_digest.h" ++#include "wd_util.h" ++ ++typedef void (sm3_ce_block_fn)(__u32 word_reg[SM3_STATE_WORDS], ++ const unsigned char *src, size_t blocks); ++ ++static int sm3_ce_drv_init(struct wd_alg_driver *drv, void *conf); ++static void sm3_ce_drv_exit(struct wd_alg_driver *drv); ++static int sm3_ce_drv_send(struct wd_alg_driver *drv, handle_t ctx, void *digest_msg); ++static int sm3_ce_drv_recv(struct wd_alg_driver *drv, handle_t ctx, void *digest_msg); ++static int sm3_ce_get_usage(void *param); ++ ++static struct wd_alg_driver sm3_ce_alg_driver = { ++ .drv_name = "isa_ce_sm3", ++ .alg_name = "sm3", ++ .calc_type = UADK_ALG_CE_INSTR, ++ .priority = 200, ++ .queue_num = 1, ++ .op_type_num = 1, ++ .fallback = 0, ++ .init = sm3_ce_drv_init, ++ .exit = sm3_ce_drv_exit, ++ .send = sm3_ce_drv_send, ++ .recv = sm3_ce_drv_recv, ++ .get_usage = sm3_ce_get_usage, ++}; ++ ++static void __attribute__((constructor)) sm3_ce_probe(void) ++{ ++ int ret; ++ ++ WD_INFO("Info: register SM3 CE alg driver!\n"); ++ ret = wd_alg_driver_register(&sm3_ce_alg_driver); ++ if (ret && ret != -WD_ENODEV) ++ WD_ERR("Error: register SM3 CE failed!\n"); ++} ++ ++static void __attribute__((destructor)) sm3_ce_remove(void) ++{ ++ wd_alg_driver_unregister(&sm3_ce_alg_driver); ++} ++ ++static int sm3_ce_get_usage(void *param) ++{ ++ return WD_SUCCESS; ++} ++ ++static inline void sm3_ce_init(struct sm3_ce_ctx *sctx) ++{ ++ sctx->word_reg[0] = SM3_IVA; ++ sctx->word_reg[1] = SM3_IVB; ++ sctx->word_reg[2] = SM3_IVC; ++ sctx->word_reg[3] = SM3_IVD; ++ sctx->word_reg[4] = SM3_IVE; ++ sctx->word_reg[5] = SM3_IVF; ++ sctx->word_reg[6] = SM3_IVG; ++ sctx->word_reg[7] = SM3_IVH; ++} ++ ++static void trans_output_result(__u8 *out_digest, __u32 *word_reg) ++{ ++ size_t i; ++ ++ for (i = 0; i < SM3_STATE_WORDS; i++) ++ PUTU32_TO_U8(out_digest + i * WORD_TO_CHAR_OFFSET, word_reg[i]); ++} ++ 
++static void sm3_ce_init_ex(struct sm3_ce_ctx *sctx, __u8 *iv, __u16 iv_bytes) ++{ ++ size_t i; ++ ++ if (iv_bytes != SM3_DIGEST_SIZE) { ++ WD_ERR("invalid iv size: %u\n", iv_bytes); ++ return; ++ } ++ ++ for (i = 0; i < SM3_STATE_WORDS; i++) ++ PUTU8_TO_U32(sctx->word_reg[i], iv + i * WORD_TO_CHAR_OFFSET); ++} ++ ++static void sm3_ce_update(struct sm3_ce_ctx *sctx, const __u8 *data, ++ size_t data_len, sm3_ce_block_fn *block_fn) ++{ ++ size_t remain_data_len, blk_num; ++ ++ /* Get the data num that need compute currently */ ++ sctx->num &= (SM3_BLOCK_SIZE - 1); ++ ++ if (sctx->num) { ++ remain_data_len = SM3_BLOCK_SIZE - sctx->num; ++ /* If data_len does not enough a block size, then leave it to final */ ++ if (data_len < remain_data_len) { ++ memcpy(sctx->block + sctx->num, data, data_len); ++ sctx->num += data_len; ++ return; ++ } ++ ++ memcpy(sctx->block + sctx->num, data, remain_data_len); ++ block_fn(sctx->word_reg, sctx->block, 1); ++ sctx->nblocks++; ++ data += remain_data_len; ++ data_len -= remain_data_len; ++ } ++ ++ /* Group the filled msg by 512-bits (64-bytes) */ ++ blk_num = data_len / SM3_BLOCK_SIZE; ++ if (blk_num) { ++ block_fn(sctx->word_reg, data, blk_num); ++ sctx->nblocks += blk_num; ++ data += SM3_BLOCK_SIZE * blk_num; ++ data_len -= SM3_BLOCK_SIZE * blk_num; ++ } ++ ++ sctx->num = data_len; ++ if (data_len) ++ memcpy(sctx->block, data, data_len); ++} ++ ++static void sm3_ce_final(struct sm3_ce_ctx *sctx, __u8 *md, ++ sm3_ce_block_fn *block_fn) ++{ ++ size_t i, offset1, offset2; ++ __u64 nh, nl; ++ ++ sctx->num &= (SM3_BLOCK_SIZE - 1); ++ sctx->block[sctx->num] = SM3_PADDING_BYTE; ++ ++ if (sctx->num <= SM3_BLOCK_SIZE - BIT_TO_BLOCK_OFFSET) { ++ memset(sctx->block + sctx->num + 1, 0, SM3_BLOCK_SIZE - sctx->num - 9); ++ } else { ++ memset(sctx->block + sctx->num + 1, 0, SM3_BLOCK_SIZE - sctx->num - 1); ++ block_fn(sctx->word_reg, sctx->block, 1); ++ memset(sctx->block, 0, SM3_BLOCK_SIZE - 8); ++ } ++ ++ /* ++ * Put the length of the message 
in bits into the last ++ * 64-bits (penultimate two words). ++ */ ++ offset2 = SM3_BLOCK_SIZE - WORD_TO_CHAR_OFFSET * 2; ++ offset1 = SM3_BLOCK_SIZE - WORD_TO_CHAR_OFFSET; ++ nh = sctx->nblocks >> NH_OFFSET; ++ nl = (sctx->nblocks << BIT_TO_BLOCK_OFFSET) + (sctx->num << BIT_TO_BYTE_OFFSET); ++ PUTU32_TO_U8(sctx->block + offset2 , nh); ++ PUTU32_TO_U8(sctx->block + offset1, nl); ++ ++ block_fn(sctx->word_reg, sctx->block, 1); ++ for (i = 0; i < SM3_STATE_WORDS; i++) ++ PUTU32_TO_U8(md + i * WORD_TO_CHAR_OFFSET, sctx->word_reg[i]); ++} ++ ++static int do_sm3_ce(struct wd_digest_msg *msg, __u8 *out_digest) ++{ ++ enum hash_block_type block_type; ++ struct sm3_ce_ctx sctx = {0}; ++ size_t data_len, iv_len; ++ __u8 *data, *iv; ++ ++ block_type = get_hash_block_type(msg); ++ data_len = msg->in_bytes; ++ data = msg->in; ++ iv_len = SM3_DIGEST_SIZE; ++ /* Use last output as the iv in current cycle */ ++ iv = msg->out; ++ ++ switch(block_type) { ++ case HASH_SINGLE_BLOCK: ++ sm3_ce_init(&sctx); ++ sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); ++ sm3_ce_final(&sctx, out_digest, sm3_ce_block_compress); ++ break; ++ case HASH_FRIST_BLOCK: ++ sm3_ce_init(&sctx); ++ sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); ++ trans_output_result(out_digest, sctx.word_reg); ++ break; ++ case HASH_MIDDLE_BLOCK: ++ sm3_ce_init_ex(&sctx, iv, iv_len); ++ sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); ++ /* Transform the middle result without final padding */ ++ trans_output_result(out_digest, sctx.word_reg); ++ break; ++ case HASH_END_BLOCK: ++ sm3_ce_init_ex(&sctx, iv, iv_len); ++ sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); ++ /* Put the whole message length in last 64-bits */ ++ sctx.nblocks = msg->long_data_len / SM3_BLOCK_SIZE; ++ sm3_ce_final(&sctx, out_digest, sm3_ce_block_compress); ++ break; ++ default: ++ WD_ERR("Invalid block type!\n"); ++ return -WD_EINVAL; ++ } ++ ++ if (msg->out_bytes < SM3_DIGEST_SIZE) ++ memcpy(msg->out, 
out_digest, msg->out_bytes); ++ else ++ memcpy(msg->out, out_digest, SM3_DIGEST_SIZE); ++ ++ memset(&sctx, 0, sizeof(struct sm3_ce_ctx)); ++ ++ return WD_SUCCESS; ++} ++ ++static void sm3_hmac_key_padding(struct hmac_sm3_ctx *hctx, ++ const __u8 *key, size_t key_len) ++{ ++ size_t i; ++ ++ if (key_len <= SM3_BLOCK_SIZE) { ++ memcpy(hctx->key, key, key_len); ++ memset(hctx->key + key_len, 0, SM3_BLOCK_SIZE - key_len); ++ } else { ++ sm3_ce_init(&hctx->sctx); ++ sm3_ce_update(&hctx->sctx, key, key_len, sm3_ce_block_compress); ++ sm3_ce_final(&hctx->sctx, hctx->key, sm3_ce_block_compress); ++ /* Pad key to SM3_BLOCK_SIZE after hash */ ++ memset(hctx->key + SM3_DIGEST_SIZE, 0, ++ SM3_BLOCK_SIZE - SM3_DIGEST_SIZE); ++ } ++ ++ for (i = 0; i < SM3_BLOCK_SIZE; i++) { ++ hctx->key[i] ^= IPAD_DATA; ++ } ++} ++ ++static void sm3_ce_hmac_init(struct hmac_sm3_ctx *hctx, const __u8 *key, size_t key_len) ++{ ++ sm3_hmac_key_padding(hctx, key, key_len); ++ ++ /* Ipadded key is the first block to hash in first cycle */ ++ sm3_ce_init(&hctx->sctx); ++ sm3_ce_update(&hctx->sctx, hctx->key, SM3_BLOCK_SIZE, sm3_ce_block_compress); ++} ++ ++static void sm3_ce_hmac_update(struct hmac_sm3_ctx *hctx, const __u8 *data, size_t data_len) ++{ ++ sm3_ce_update(&hctx->sctx, data, data_len, sm3_ce_block_compress); ++} ++ ++static void sm3_ce_hmac_final(struct hmac_sm3_ctx *hctx, __u8 *out_hmac) ++{ ++ __u8 digest[SM3_DIGEST_SIZE] = {0}; ++ size_t i; ++ ++ for (i = 0; i < SM3_BLOCK_SIZE; i++) { ++ hctx->key[i] ^= (IPAD_DATA ^ OPAD_DATA); ++ } ++ ++ /* Compute the last data from update process */ ++ sm3_ce_final(&hctx->sctx, digest, sm3_ce_block_compress); ++ ++ /* Opadded key is the first block to hash in second cycle */ ++ memset(&hctx->sctx, 0, sizeof(struct sm3_ce_ctx)); ++ sm3_ce_init(&hctx->sctx); ++ sm3_ce_update(&hctx->sctx, hctx->key, SM3_BLOCK_SIZE, sm3_ce_block_compress); ++ ++ /* Compute the the first cycle result */ ++ sm3_ce_update(&hctx->sctx, digest, SM3_DIGEST_SIZE, 
sm3_ce_block_compress); ++ sm3_ce_final(&hctx->sctx, out_hmac, sm3_ce_block_compress); ++} ++ ++static int do_hmac_sm3_ce(struct wd_digest_msg *msg, __u8 *out_hmac) ++{ ++ size_t data_len, key_len, iv_len; ++ enum hash_block_type block_type; ++ struct hmac_sm3_ctx hctx = {0}; ++ __u8 *data, *key, *iv; ++ ++ data_len = msg->in_bytes; ++ data = msg->in; ++ key = msg->key; ++ key_len = msg->key_bytes; ++ iv_len = SM3_DIGEST_SIZE; ++ /* Use last output as the iv in current cycle */ ++ iv = msg->out; ++ ++ if (!key_len) { ++ WD_ERR("invalid hmac key_len is 0!\n"); ++ return -WD_EINVAL; ++ } ++ ++ block_type = get_hash_block_type(msg); ++ switch(block_type) { ++ case HASH_SINGLE_BLOCK: ++ sm3_ce_hmac_init(&hctx, key, key_len); ++ sm3_ce_hmac_update(&hctx, data, data_len); ++ sm3_ce_hmac_final(&hctx, out_hmac); ++ break; ++ case HASH_FRIST_BLOCK: ++ sm3_ce_hmac_init(&hctx, key, key_len); ++ sm3_ce_hmac_update(&hctx, data, data_len); ++ trans_output_result(out_hmac, hctx.sctx.word_reg); ++ break; ++ case HASH_MIDDLE_BLOCK: ++ sm3_ce_init_ex(&(hctx.sctx), iv, iv_len); ++ sm3_ce_hmac_update(&hctx, data, data_len); ++ trans_output_result(out_hmac, hctx.sctx.word_reg); ++ break; ++ case HASH_END_BLOCK: ++ sm3_hmac_key_padding(&hctx, key, key_len); ++ sm3_ce_init_ex(&(hctx.sctx), iv, iv_len); ++ sm3_ce_hmac_update(&hctx, data, data_len); ++ hctx.sctx.nblocks = msg->long_data_len / SM3_BLOCK_SIZE + KEY_BLOCK_NUM; ++ sm3_ce_hmac_final(&hctx, out_hmac); ++ break; ++ default: ++ WD_ERR("Invalid block type!\n"); ++ return -WD_EINVAL; ++ } ++ ++ if (msg->out_bytes < SM3_DIGEST_SIZE) ++ memcpy(msg->out, out_hmac, msg->out_bytes); ++ else ++ memcpy(msg->out, out_hmac, SM3_DIGEST_SIZE); ++ ++ memset(&hctx, 0, sizeof(struct hmac_sm3_ctx)); ++ ++ return WD_SUCCESS; ++} ++ ++static int sm3_ce_drv_send(struct wd_alg_driver *drv, handle_t ctx, void *digest_msg) ++{ ++ struct wd_digest_msg *msg = (struct wd_digest_msg *)digest_msg; ++ __u8 digest[SM3_DIGEST_SIZE] = {0}; ++ int ret; ++ ++ if 
(!msg) { ++ WD_ERR("invalid: digest_msg is NULL!\n"); ++ return -WD_EINVAL; ++ } ++ ++ if (msg->data_fmt == WD_SGL_BUF) { ++ WD_ERR("invalid: SM3 CE driver do not support sgl data format!\n"); ++ return -WD_EINVAL; ++ } ++ ++ if (msg->mode == WD_DIGEST_NORMAL) { ++ ret = do_sm3_ce(msg, digest); ++ } else if (msg->mode == WD_DIGEST_HMAC) { ++ ret = do_hmac_sm3_ce(msg, digest); ++ } else { ++ WD_ERR("invalid digest mode!\n"); ++ ret = -WD_EINVAL; ++ } ++ ++ return ret; ++} ++ ++static int sm3_ce_drv_recv(struct wd_alg_driver *drv, handle_t ctx, void *digest_msg) ++{ ++ return WD_SUCCESS; ++} ++ ++static int sm3_ce_drv_init(struct wd_alg_driver *drv, void *conf) ++{ ++ struct wd_ctx_config_internal *config = (struct wd_ctx_config_internal *)conf; ++ struct sm3_ce_drv_ctx *sctx = (struct sm3_ce_drv_ctx *)drv->priv; ++ ++ config->epoll_en = false; ++ ++ /* return if already inited */ ++ if (sctx) ++ return WD_SUCCESS; ++ sctx = malloc(sizeof(struct sm3_ce_drv_ctx)); ++ if (!sctx) ++ return -WD_EINVAL; ++ ++ memcpy(&sctx->config, config, sizeof(struct wd_ctx_config_internal)); ++ ++ return WD_SUCCESS; ++} ++ ++static void sm3_ce_drv_exit(struct wd_alg_driver *drv) ++{ ++ struct sm3_ce_drv_ctx *sctx = (struct sm3_ce_drv_ctx *)drv->priv; ++ ++ if (!sctx) ++ return; ++ ++ free(sctx); ++ drv->priv = NULL; ++} +diff --git a/drv/isa_ce_sm3.h b/drv/isa_ce_sm3.h +new file mode 100644 +index 0000000..13edb0a +--- /dev/null ++++ b/drv/isa_ce_sm3.h +@@ -0,0 +1,86 @@ ++/* SPDX-License-Identifier: Apache-2.0 */ ++/* Copyright 2020-2021 Huawei Technologies Co.,Ltd. All rights reserved. 
*/ ++#ifndef __ISA_CE_SM3_H ++#define __ISA_CE_SM3_H ++ ++#include "wd_alg_common.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define SM3_DIGEST_SIZE 32 ++#define SM3_BLOCK_SIZE 64 ++#define SM3_STATE_WORDS 8 ++#define HMAC_BLOCK_SIZE 64 ++#define WORD_TO_CHAR_OFFSET 4 ++#define SM3_PADDING_BYTE 0x80 ++#define NH_OFFSET 23 ++#define BIT_TO_BLOCK_OFFSET 9 ++#define BIT_TO_BYTE_OFFSET 3 ++#define IPAD_DATA 0x36 ++#define OPAD_DATA 0x5c ++#define KEY_BLOCK_NUM 1 ++ ++#define SM3_IVA 0x7380166f ++#define SM3_IVB 0x4914b2b9 ++#define SM3_IVC 0x172442d7 ++#define SM3_IVD 0xda8a0600 ++#define SM3_IVE 0xa96f30bc ++#define SM3_IVF 0x163138aa ++#define SM3_IVG 0xe38dee4d ++#define SM3_IVH 0xb0fb0e4e ++ ++#define PUTU32_TO_U8(dst, src) \ ++ ((dst)[0] = (__u8)((src) >> 24), \ ++ (dst)[1] = (__u8)((src) >> 16), \ ++ (dst)[2] = (__u8)((src) >> 8), \ ++ (dst)[3] = (__u8)(src)) ++ ++#define PUTU8_TO_U32(dst, src) \ ++ ((dst) = (((__u32)(src)[0]) << 24) + \ ++ (((__u32)(src)[1]) << 16) + \ ++ (((__u32)(src)[2]) << 8) + \ ++ ((__u32)(src)[3])) ++ ++struct sm3_ce_ctx { ++ /* ++ * Use an array to represent the eight 32-bits word registers, ++ * SM3_IVA, SM3_IVB, ..., SM3_IVH, save IV and the final digest. ++ */ ++ __u32 word_reg[SM3_STATE_WORDS]; ++ /* ++ * The length (in bits) of all the msg fragments, the length of the ++ * whole msg should less than 2^64 bit, a msg block is 512-bits, ++ * make a 64-bits number in two parts, low 32-bits - 'Nl' and ++ * high 32-bits - 'Nh'. ++ */ ++ __u64 nblocks; ++ /* ++ * Message block, a msg block is 512-bits, use sixteen __u32 type ++ * element to store it, used in B(i) = W0||W1||W2||...||W15. ++ * Use a __u8 array to replace the 32-bit array. ++ */ ++ __u8 block[SM3_BLOCK_SIZE]; ++ /* The number of msg that need to compute in current cycle or turn. 
*/ ++ size_t num; ++}; ++ ++struct hmac_sm3_ctx { ++ struct sm3_ce_ctx sctx; ++ /* Save user key */ ++ __u8 key[SM3_BLOCK_SIZE]; ++}; ++ ++struct sm3_ce_drv_ctx { ++ struct wd_ctx_config_internal config; ++}; ++ ++void sm3_ce_block_compress(__u32 word_reg[SM3_STATE_WORDS], ++ const __u8 *src, size_t blocks); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* __ISA_CE_SM3_H */ +diff --git a/drv/isa_ce_sm3_armv8.S b/drv/isa_ce_sm3_armv8.S +new file mode 100644 +index 0000000..3d08e2d +--- /dev/null ++++ b/drv/isa_ce_sm3_armv8.S +@@ -0,0 +1,765 @@ ++/* SPDX-License-Identifier: Apache-2.0 */ ++/* ++ * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#include "../include/drv/arm_arch_ce.h" ++ ++.arch armv8.2-a ++.text ++.globl sm3_ce_block_compress ++.type sm3_ce_block_compress,%function ++.align 5 ++sm3_ce_block_compress: ++ AARCH64_VALID_CALL_TARGET ++/* Loads state */ ++ /* ++ * Loads multiple single-element structures from memory(X0 register) and ++ * writes result to two SIMD&FP registers(v5.4s and v6.4s). ++ */ ++ ld1 {v5.4s,v6.4s}, [x0] /* 4s -- 4 * 32bit */ ++ /* ++ * Reverses the order of 32-bit(type:s) elements in each doubleword of the ++ * vector in the src SIMD&FP register(v5), places the result into a vector ++ * and writes the vector to the dst SIMD&FP register(v5). ++ */ ++ rev64 v5.4s, v5.4s ++ rev64 v6.4s, v6.4s ++ /* ++ * Extracts the lowest vector elements from the second src SIMD&FP register, ++ * and highest vector elements from the first source SIMD&FP register, ++ * concatenates the result into a vector, and writes the vector to the ++ * dst SIMD&FP register vector. #8 means the numbered byte element to be extracted.
++ * Format: ext <Vd>, <Vn>, <Vm>, <#imm> ++ * #imm: immediate data. ++ */ ++ ext v5.16b, v5.16b, v5.16b, #8 /* 16b -- 16 * 8bit */ ++ ext v6.16b, v6.16b, v6.16b, #8 ++ /* Form PC-relative address: adds an immediate value to the PC to form a ++ * PC-relative address, and writes the result to the dst register. ++ */ ++ adr x8, .Tj /* 'Tj' is the constant defined in SM3 protocol */ ++ /* Load pair of registers: calculates an address from a base register value ++ * and an immediate offset, loads two 32-bit words from memory, and writes ++ * them to two registers. */ ++ ldp s16, s17, [x8] /* 'sn' is the scalar register, 'vn' is the vector register */ ++ ++.Loop: ++/* Loads input */ ++ /* ++ * Loads multiple single-element structures to four registers. ++ * #64 is the immediate offset variant, it is the post-index immediate offset. ++ * Loads the input src data, msg to be hashed. ++ */ ++ ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [x1], #64 ++ /* ++ * Subtracts an optionally-shifted immediate value from a register value, ++ * and writes the result to the dst register. ++ */ ++ sub w2, w2, #1 ++ ++ /* Copies the value in a src register to the dst register.
*/ ++ mov v18.16b, v5.16b ++ mov v19.16b, v6.16b ++ ++#ifndef __ARMEB__ ++ rev32 v0.16b, v0.16b ++ rev32 v1.16b, v1.16b ++ rev32 v2.16b, v2.16b ++ rev32 v3.16b, v3.16b ++#endif ++ ++ ext v20.16b, v16.16b, v16.16b, #4 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v4.16b, v1.16b, v2.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v0.16b, v1.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v2.16b, v3.16b, #8 ++ /* sm3partw1 v4.4s, v0.4s, v3.4s */ ++.inst 0xce63c004 ++ /* sm3partw2 v4.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e4 ++ eor v22.16b, v0.16b, v1.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5682e5 ++ /* sm3tt2a v6.4s, v23.4s, v0.4s[0] */ ++.inst 0xce408ae6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5692e5 ++ /* sm3tt2a v6.4s, v23.4s, v0.4s[1] */ ++.inst 0xce409ae6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a2e5 ++ /* sm3tt2a v6.4s, v23.4s, v0.4s[2] */ ++.inst 0xce40aae6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b2e5 ++ /* sm3tt2a v6.4s, v23.4s, v0.4s[3] */ ++.inst 0xce40bae6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v0.16b, v2.16b, v3.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v1.16b, v2.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v3.16b, v4.16b, #8 ++ /* sm3partw1 v0.4s, v1.4s, v4.4s */ ++.inst 0xce64c020 ++ /* sm3partw2 v0.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e0 ++ eor v22.16b, v1.16b, v2.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, 
v20.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5682e5 ++ /* sm3tt2a v6.4s, v23.4s, v1.4s[0] */ ++.inst 0xce418ae6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5692e5 ++ /* sm3tt2a v6.4s, v23.4s, v1.4s[1] */ ++.inst 0xce419ae6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a2e5 ++ /* sm3tt2a v6.4s, v23.4s, v1.4s[2] */ ++.inst 0xce41aae6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b2e5 ++ /* sm3tt2a v6.4s, v23.4s, v1.4s[3] */ ++.inst 0xce41bae6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v1.16b, v3.16b, v4.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v2.16b, v3.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v4.16b, v0.16b, #8 ++ /* sm3partw1 v1.4s, v2.4s, v0.4s */ ++.inst 0xce60c041 ++ /* sm3partw2 v1.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e1 ++ eor v22.16b, v2.16b, v3.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5682e5 ++ /* sm3tt2a v6.4s, v23.4s, v2.4s[0] */ ++.inst 0xce428ae6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5692e5 ++ /* sm3tt2a v6.4s, v23.4s, v2.4s[1] */ ++.inst 0xce429ae6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a2e5 ++ /* sm3tt2a v6.4s, v23.4s, v2.4s[2] */ ++.inst 0xce42aae6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl 
v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b2e5 ++ /* sm3tt2a v6.4s, v23.4s, v2.4s[3] */ ++.inst 0xce42bae6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v2.16b, v4.16b, v0.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v3.16b, v4.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v0.16b, v1.16b, #8 ++ /* sm3partw1 v2.4s, v3.4s, v1.4s */ ++.inst 0xce61c062 ++ /* sm3partw2 v2.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e2 ++ eor v22.16b, v3.16b, v4.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5682e5 ++ /* sm3tt2a v6.4s, v23.4s, v3.4s[0] */ ++.inst 0xce438ae6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5692e5 ++ /* sm3tt2a v6.4s, v23.4s, v3.4s[1] */ ++.inst 0xce439ae6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a2e5 ++ /* sm3tt2a v6.4s, v23.4s, v3.4s[2] */ ++.inst 0xce43aae6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1a v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b2e5 ++ /* sm3tt2a v6.4s, v23.4s, v3.4s[3] */ ++.inst 0xce43bae6 ++ ext v20.16b, v17.16b, v17.16b, #4 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v3.16b, v0.16b, v1.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v4.16b, v0.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v1.16b, v2.16b, #8 ++ /* sm3partw1 v3.4s, v4.4s, v2.4s */ ++.inst 0xce62c083 ++ /* sm3partw2 v3.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e3 ++ eor v22.16b, v4.16b, v0.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ 
/* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[0] */ ++.inst 0xce448ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[1] */ ++.inst 0xce449ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[2] */ ++.inst 0xce44aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[3] */ ++.inst 0xce44bee6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v4.16b, v1.16b, v2.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v0.16b, v1.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v2.16b, v3.16b, #8 ++ /* sm3partw1 v4.4s, v0.4s, v3.4s */ ++.inst 0xce63c004 ++ /* sm3partw2 v4.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e4 ++ eor v22.16b, v0.16b, v1.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[0] */ ++.inst 0xce408ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[1] */ ++.inst 0xce409ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[2] */ ++.inst 0xce40aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 
++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[3] */ ++.inst 0xce40bee6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v0.16b, v2.16b, v3.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v1.16b, v2.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v3.16b, v4.16b, #8 ++ /* sm3partw1 v0.4s, v1.4s, v4.4s */ ++.inst 0xce64c020 ++ /* sm3partw2 v0.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e0 ++ eor v22.16b, v1.16b, v2.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v1.4s[0] */ ++.inst 0xce418ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v1.4s[1] */ ++.inst 0xce419ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v1.4s[2] */ ++.inst 0xce41aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v1.4s[3] */ ++.inst 0xce41bee6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v1.16b, v3.16b, v4.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v2.16b, v3.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v4.16b, v0.16b, #8 ++ /* sm3partw1 v1.4s, v2.4s, v0.4s */ ++.inst 0xce60c041 ++ /* sm3partw2 v1.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e1 ++ eor v22.16b, v2.16b, v3.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 
0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v2.4s[0] */ ++.inst 0xce428ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v2.4s[1] */ ++.inst 0xce429ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v2.4s[2] */ ++.inst 0xce42aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v2.4s[3] */ ++.inst 0xce42bee6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v2.16b, v4.16b, v0.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v3.16b, v4.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v0.16b, v1.16b, #8 ++ /* sm3partw1 v2.4s, v3.4s, v1.4s */ ++.inst 0xce61c062 ++ /* sm3partw2 v2.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e2 ++ eor v22.16b, v3.16b, v4.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v3.4s[0] */ ++.inst 0xce438ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v3.4s[1] */ ++.inst 0xce439ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v3.4s[2] */ ++.inst 0xce43aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, 
v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v3.4s[3] */ ++.inst 0xce43bee6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v3.16b, v0.16b, v1.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v4.16b, v0.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v1.16b, v2.16b, #8 ++ /* sm3partw1 v3.4s, v4.4s, v2.4s */ ++.inst 0xce62c083 ++ /* sm3partw2 v3.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e3 ++ eor v22.16b, v4.16b, v0.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[0] */ ++.inst 0xce448ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[1] */ ++.inst 0xce449ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[2] */ ++.inst 0xce44aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[3] */ ++.inst 0xce44bee6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v4.16b, v1.16b, v2.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v0.16b, v1.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v2.16b, v3.16b, #8 ++ /* sm3partw1 v4.4s, v0.4s, v3.4s */ ++.inst 0xce63c004 ++ /* sm3partw2 v4.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e4 ++ eor v22.16b, v0.16b, v1.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[0] */ 
++.inst 0xce408ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[1] */ ++.inst 0xce409ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[2] */ ++.inst 0xce40aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[3] */ ++.inst 0xce40bee6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v0.16b, v2.16b, v3.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v1.16b, v2.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v3.16b, v4.16b, #8 ++ /* sm3partw1 v0.4s, v1.4s, v4.4s */ ++.inst 0xce64c020 ++ /* sm3partw2 v0.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e0 ++ eor v22.16b, v1.16b, v2.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v1.4s[0] */ ++.inst 0xce418ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v1.4s[1] */ ++.inst 0xce419ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v1.4s[2] */ ++.inst 0xce41aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* 
sm3tt2b v6.4s, v23.4s, v1.4s[3] */ ++.inst 0xce41bee6 ++ /* s4 = w7 | w8 | w9 | w10 */ ++ ext v1.16b, v3.16b, v4.16b, #12 ++ /* vtmp1 = w3 | w4 | w5 | w6 */ ++ ext v22.16b, v2.16b, v3.16b, #12 ++ /* vtmp2 = w10 | w11 | w12 | w13 */ ++ ext v23.16b, v4.16b, v0.16b, #8 ++ /* sm3partw1 v1.4s, v2.4s, v0.4s */ ++.inst 0xce60c041 ++ /* sm3partw2 v1.4s, v23.4s, v22.4s */ ++.inst 0xce76c6e1 ++ eor v22.16b, v2.16b, v3.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v2.4s[0] */ ++.inst 0xce428ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v2.4s[1] */ ++.inst 0xce429ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v2.4s[2] */ ++.inst 0xce42aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v2.4s[3] */ ++.inst 0xce42bee6 ++ eor v22.16b, v3.16b, v4.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v3.4s[0] */ ++.inst 0xce438ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v3.4s[1] */ ++.inst 0xce439ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* 
sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v3.4s[2] */ ++.inst 0xce43aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v3.4s[3] */ ++.inst 0xce43bee6 ++ eor v22.16b, v4.16b, v0.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[0] */ ++.inst 0xce448ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[1] */ ++.inst 0xce449ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[2] */ ++.inst 0xce44aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v4.4s[3] */ ++.inst 0xce44bee6 ++ eor v22.16b, v0.16b, v1.16b ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ ++.inst 0xce5686e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[0] */ ++.inst 0xce408ee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ ++.inst 0xce5696e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[1] */ ++.inst 0xce409ee6 ++ /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ ++.inst 0xce5418b7 ++ shl v21.4s, v20.4s, #1 ++ sri v21.4s, v20.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[2] 
*/ ++.inst 0xce56a6e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[2] */ ++.inst 0xce40aee6 ++ /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ ++.inst 0xce5518b7 ++ shl v20.4s, v21.4s, #1 ++ sri v20.4s, v21.4s, #31 ++ /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ ++.inst 0xce56b6e5 ++ /* sm3tt2b v6.4s, v23.4s, v0.4s[3] */ ++.inst 0xce40bee6 ++ eor v5.16b, v5.16b, v18.16b ++ eor v6.16b, v6.16b, v19.16b ++ /* ++ * cbnz: compare and branch on Nonzero, compares the value in a register ++ * with zero, and conditionally branches to a label at a PC-relative offset ++ * if the comparison is not equal. ++ * 'w2' is the 32-bit name of the general-purpose register to be tested. ++ * '.Loop' is the program label to be conditionally branched to. ++ */ ++ cbnz w2, .Loop ++ ++ /* save state, it is the result of one cycle */ ++ rev64 v5.4s, v5.4s ++ rev64 v6.4s, v6.4s ++ ext v5.16b, v5.16b, v5.16b, #8 ++ ext v6.16b, v6.16b, v6.16b, #8 ++ st1 {v5.4s,v6.4s}, [x0] ++ ret ++.size sm3_ce_block_compress,.-sm3_ce_block_compress ++ ++.align 3 ++.Tj: ++/* ++ * Inserts a list of 32-bit values as data into the assembly. ++ * In SM3 protocol: ++ * when 0 <= j <= 15, Tj = 0x79cc4519, ++ * when 16 <= j <= 63, Tj = 0x9d8a7a87. ++ */ ++.word 0x79cc4519, 0x9d8a7a87 +diff --git a/include/drv/arm_arch_ce.h b/include/drv/arm_arch_ce.h +new file mode 100644 +index 0000000..3ea81a4 +--- /dev/null ++++ b/include/drv/arm_arch_ce.h +@@ -0,0 +1,199 @@ ++/* SPDX-License-Identifier: Apache-2.0 */ ++/* ++ * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. 
You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#ifndef __ARM_ARCH_CE_H ++#define __ARM_ARCH_CE_H ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#if !defined(__ARM_ARCH__) ++# if defined(__CC_ARM) ++# define __ARM_ARCH__ __TARGET_ARCH_ARM ++# if defined(__BIG_ENDIAN) ++# define __ARMEB__ ++# else ++# define __ARMEL__ ++# endif ++# elif defined(__GNUC__) ++# if defined(__aarch64__) ++# define __ARM_ARCH__ 8 ++ /* ++ * GCC does not define __ARM_ARCH__, instead it defines ++ * bunch of below macros. See all_architectures[] table in ++ * gcc/config/arm/arm.c. ++ */ ++# elif defined(__ARM_ARCH) ++# define __ARM_ARCH__ __ARM_ARCH ++# elif defined(__ARM_ARCH_8A__) ++# define __ARM_ARCH__ 8 ++# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ ++ defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \ ++ defined(__ARM_ARCH_7EM__) ++# define __ARM_ARCH__ 7 ++# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ ++ defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6M__) || \ ++ defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ ++ defined(__ARM_ARCH_6T2__) ++# define __ARM_ARCH__ 6 ++# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ ++ defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ ++ defined(__ARM_ARCH_5TEJ__) ++# define __ARM_ARCH__ 5 ++# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) ++# define __ARM_ARCH__ 4 ++# else ++# error "unsupported ARM architecture" ++# endif ++# endif ++#endif ++ ++#if !defined(__ARM_MAX_ARCH__) ++# define __ARM_MAX_ARCH__ __ARM_ARCH__ ++#endif ++ ++#if __ARM_MAX_ARCH__ < __ARM_ARCH__ ++# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__" ++#elif __ARM_MAX_ARCH__ != __ARM_ARCH__ ++# if __ARM_ARCH__ < 7 && __ARM_MAX_ARCH__ >= 7 && defined(__ARMEB__) ++# error "can't build universal big-endian binary" ++# endif ++#endif ++ ++#ifndef __ASSEMBLER__ ++extern unsigned int ARMCAP_P; 
++extern unsigned int ARM_MIDR; ++#endif ++ ++#define ARMV7_NEON (1<<0) ++#define ARMV7_TICK (1<<1) ++#define ARMV8_AES (1<<2) ++#define ARMV8_SHA1 (1<<3) ++#define ARMV8_SHA256 (1<<4) ++#define ARMV8_PMULL (1<<5) ++#define ARMV8_SHA512 (1<<6) ++#define ARMV8_CPUID (1<<7) ++#define ARMV8_RNG (1<<8) ++#define ARMV8_SM3 (1<<9) ++#define ARMV8_SM4 (1<<10) ++#define ARMV8_SHA3 (1<<11) ++#define ARMV8_UNROLL8_EOR3 (1<<12) ++#define ARMV8_SVE (1<<13) ++#define ARMV8_SVE2 (1<<14) ++ ++/* ++ * MIDR_EL1 system register ++ * ++ * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 ++ * | | | | | | | ++ * |RES0 | Implementer | Variant | Arch | PartNum |Revision| ++ * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________| ++ * ++ */ ++ ++#define ARM_CPU_IMP_ARM 0x41 ++#define HISI_CPU_IMP 0x48 ++ ++#define ARM_CPU_PART_CORTEX_A72 0xD08 ++#define ARM_CPU_PART_N1 0xD0C ++#define ARM_CPU_PART_V1 0xD40 ++#define ARM_CPU_PART_N2 0xD49 ++#define HISI_CPU_PART_KP920 0xD01 ++ ++#define MIDR_PARTNUM_SHIFT 4 ++#define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) ++#define MIDR_PARTNUM(midr) \ ++ (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) ++ ++#define MIDR_IMPLEMENTER_SHIFT 24 ++#define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT) ++#define MIDR_IMPLEMENTER(midr) \ ++ (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) ++ ++#define MIDR_ARCHITECTURE_SHIFT 16 ++#define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT) ++#define MIDR_ARCHITECTURE(midr) \ ++ (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) ++ ++#define MIDR_CPU_MODEL_MASK \ ++ (MIDR_IMPLEMENTER_MASK | \ ++ MIDR_PARTNUM_MASK | \ ++ MIDR_ARCHITECTURE_MASK) ++ ++#define MIDR_CPU_MODEL(imp, partnum) \ ++ (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ ++ (0xfU << MIDR_ARCHITECTURE_SHIFT) | \ ++ ((partnum) << MIDR_PARTNUM_SHIFT)) ++ ++#define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ ++ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) ++ 
++#if defined(__ASSEMBLER__) ++ /* ++ * Support macros for ++ * - Armv8.3-A Pointer Authentication and ++ * - Armv8.5-A Branch Target Identification ++ * features which require emitting a .note.gnu.property section with the ++ * appropriate architecture-dependent feature bits set. ++ * Read more: "ELF for the Arm?? 64-bit Architecture" ++ */ ++# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 ++# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ ++# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ ++# else ++# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ ++# define AARCH64_VALID_CALL_TARGET ++# endif ++ ++# if defined(__ARM_FEATURE_PAC_DEFAULT) && \ ++ (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */ ++# define GNU_PROPERTY_AARCH64_POINTER_AUTH (1 << 1) /* Has Pointer Authentication */ ++# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */ ++# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */ ++# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ ++ (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */ ++# define GNU_PROPERTY_AARCH64_POINTER_AUTH (1 << 1) /* Has Pointer Authentication */ ++# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */ ++# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */ ++# else ++# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */ ++# if GNU_PROPERTY_AARCH64_BTI != 0 ++# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET ++# else ++# define AARCH64_SIGN_LINK_REGISTER ++# endif ++# define AARCH64_VALIDATE_LINK_REGISTER ++# endif ++ ++# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 ++ .pushsection .note.gnu.property, "a"; ++ .balign 8; ++ .long 4; ++ .long 0x10; ++ .long 0x5; ++ .asciz "GNU"; ++ .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ ++ .long 4; ++ .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); ++ .long 
0; ++ .popsection; ++# endif ++ ++#endif /* defined __ASSEMBLER__ */ ++ ++#define IS_CPU_SUPPORT_UNROLL8_EOR3() \ ++ (ARMCAP_P & ARMV8_UNROLL8_EOR3) ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* __ARM_ARCH_CE_H */ +diff --git a/include/wd_alg.h b/include/wd_alg.h +index f8b136e..861b7d9 100644 +--- a/include/wd_alg.h ++++ b/include/wd_alg.h +@@ -19,6 +19,49 @@ extern "C" { + #define ALG_NAME_SIZE 128 + #define DEV_NAME_LEN 128 + ++/* ++ * Macros related to arm platform: ++ * ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas ++ * AArch64 used AT_HWCAP. ++ */ ++#ifndef AT_HWCAP ++# define AT_HWCAP 16 ++#endif ++ ++#ifndef AT_HWCAP2 ++# define AT_HWCAP2 26 ++#endif ++ ++#if defined(__arm__) || defined(__arm) ++# define HWCAP AT_HWCAP ++# define HWCAP_NEON (1 << 12) ++ ++# define HWCAP_CE AT_HWCAP2 ++# define HWCAP_CE_AES (1 << 0) ++# define HWCAP_CE_PMULL (1 << 1) ++# define HWCAP_CE_SHA1 (1 << 2) ++# define HWCAP_CE_SHA256 (1 << 3) ++#elif defined(__aarch64__) ++# define HWCAP AT_HWCAP ++# define HWCAP_NEON (1 << 1) ++ ++# define HWCAP_CE HWCAP ++# define HWCAP_CE_AES (1 << 3) ++# define HWCAP_CE_PMULL (1 << 4) ++# define HWCAP_CE_SHA1 (1 << 5) ++# define HWCAP_CE_SHA256 (1 << 6) ++# define HWCAP_CPUID (1 << 11) ++# define HWCAP_SHA3 (1 << 17) ++# define HWCAP_CE_SM3 (1 << 18) ++# define HWCAP_CE_SM4 (1 << 19) ++# define HWCAP_CE_SHA512 (1 << 21) ++# define HWCAP_SVE (1 << 22) ++/* AT_HWCAP2 */ ++# define HWCAP2 26 ++# define HWCAP2_SVE2 (1 << 1) ++# define HWCAP2_RNG (1 << 16) ++#endif ++ + enum alg_dev_type { + UADK_ALG_SOFT = 0x0, + UADK_ALG_CE_INSTR = 0x1, +diff --git a/wd_alg.c b/wd_alg.c +index 3b111c8..f34a407 100644 +--- a/wd_alg.c ++++ b/wd_alg.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + + #include "wd.h" + #include "wd_alg.h" +@@ -90,6 +91,24 @@ static bool wd_check_accel_dev(const char *dev_name) + return false; + } + ++static bool wd_check_ce_support(const char *dev_name) ++{ ++ unsigned long hwcaps = 0; ++ 
++ #if defined(__arm__) || defined(__arm) ++ hwcaps = getauxval(AT_HWCAP2); ++ #elif defined(__aarch64__) ++ hwcaps = getauxval(AT_HWCAP); ++ #endif ++ if (!strcmp("isa_ce_sm3", dev_name) && (hwcaps & HWCAP_CE_SM3)) ++ return true; ++ ++ if (!strcmp("isa_ce_sm4", dev_name) && (hwcaps & HWCAP_CE_SM4)) ++ return true; ++ ++ return false; ++} ++ + static bool wd_alg_check_available(int calc_type, const char *dev_name) + { + bool ret = false; +@@ -99,6 +118,7 @@ static bool wd_alg_check_available(int calc_type, const char *dev_name) + break; + /* Should find the CPU if not support CE */ + case UADK_ALG_CE_INSTR: ++ ret = wd_check_ce_support(dev_name); + break; + /* Should find the CPU if not support SVE */ + case UADK_ALG_SVE_INSTR: +@@ -280,8 +300,13 @@ struct wd_alg_driver *wd_request_drv(const char *alg_name, bool hw_mask) + struct wd_alg_driver *drv = NULL; + int tmp_priority = -1; + +- if (!pnext || !alg_name) { +- WD_ERR("invalid: request alg param is error!\n"); ++ if (!pnext) { ++ WD_ERR("invalid: requset drv pnext is NULL!\n"); ++ return NULL; ++ } ++ ++ if (!alg_name) { ++ WD_ERR("invalid: alg_name is NULL!\n"); + return NULL; + } + +@@ -289,7 +314,8 @@ struct wd_alg_driver *wd_request_drv(const char *alg_name, bool hw_mask) + pthread_mutex_lock(&mutex); + while (pnext) { + /* hw_mask true mean not to used hardware dev */ +- if (hw_mask && pnext->drv->calc_type == UADK_ALG_HW) { ++ if ((hw_mask && pnext->drv->calc_type == UADK_ALG_HW) || ++ (!hw_mask && pnext->drv->calc_type != UADK_ALG_HW)) { + pnext = pnext->next; + continue; + } +diff --git a/wd_digest.c b/wd_digest.c +index c59184d..491502a 100644 +--- a/wd_digest.c ++++ b/wd_digest.c +@@ -222,7 +222,7 @@ static void wd_digest_clear_status(void) + } + + static int wd_digest_init_nolock(struct wd_ctx_config *config, +- struct wd_sched *sched) ++ struct wd_sched *sched) + { + int ret; + +diff --git a/wd_sched.c b/wd_sched.c +index 419280e..b43834d 100644 +--- a/wd_sched.c ++++ b/wd_sched.c +@@ -453,7 +453,7 
@@ static struct wd_sched sched_table[SCHED_POLICY_BUTT] = { + .poll_policy = session_sched_poll_policy, + }, { + .name = "None scheduler", +- .sched_policy = SCHED_POLICY_SINGLE, ++ .sched_policy = SCHED_POLICY_NONE, + .sched_init = sched_none_init, + .pick_next_ctx = sched_none_pick_next_ctx, + .poll_policy = sched_none_poll_policy, +diff --git a/wd_util.c b/wd_util.c +index 6134239..39909ca 100644 +--- a/wd_util.c ++++ b/wd_util.c +@@ -91,6 +91,11 @@ struct acc_alg_item { + char *algtype; + }; + ++struct wd_ce_ctx { ++ char *drv_name; ++ void *priv; ++}; ++ + static struct acc_alg_item alg_options[] = { + {"zlib", "zlib"}, + {"gzip", "gzip"}, +@@ -229,7 +234,6 @@ int wd_init_ctx_config(struct wd_ctx_config_internal *in, + ret = -WD_EINVAL; + goto err_out; + } +- + clone_ctx_to_internal(cfg->ctxs + i, ctxs + i); + ret = pthread_spin_init(&ctxs[i].lock, PTHREAD_PROCESS_SHARED); + if (ret) { +@@ -2612,14 +2616,44 @@ out_freelist: + return ret; + } + ++static int wd_alg_ce_ctx_init(struct wd_init_attrs *attrs) ++{ ++ struct wd_ctx_config *ctx_config = attrs->ctx_config; ++ ++ ctx_config->ctx_num = 1; ++ ctx_config->ctxs = calloc(ctx_config->ctx_num, sizeof(struct wd_ctx)); ++ if (!ctx_config->ctxs) { ++ return -WD_ENOMEM; ++ WD_ERR("failed to alloc ctxs!\n"); ++ } ++ ctx_config->ctxs[0].ctx = (handle_t)calloc(1, sizeof(struct wd_ce_ctx)); ++ ++ return WD_SUCCESS; ++} ++ ++static void wd_alg_ce_ctx_uninit(struct wd_ctx_config *ctx_config) ++{ ++ __u32 i; ++ ++ for (i = 0; i < ctx_config->ctx_num; i++) { ++ if (ctx_config->ctxs[i].ctx) { ++ free((struct wd_ce_ctx *)ctx_config->ctxs[i].ctx); ++ ctx_config->ctxs[i].ctx = 0; ++ } ++ } ++ ++ free(ctx_config->ctxs); ++} ++ + static void wd_alg_ctx_uninit(struct wd_ctx_config *ctx_config) + { + __u32 i; + +- for (i = 0; i < ctx_config->ctx_num; i++) ++ for (i = 0; i < ctx_config->ctx_num; i++) { + if (ctx_config->ctxs[i].ctx) { + wd_release_ctx(ctx_config->ctxs[i].ctx); + ctx_config->ctxs[i].ctx = 0; ++ } + } + + 
free(ctx_config->ctxs); +@@ -2633,9 +2667,9 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) + struct wd_ctx_config *ctx_config = NULL; + struct wd_sched *alg_sched = NULL; + char alg_type[CRYPTO_MAX_ALG_NAME]; +- char *alg = attrs->alg; + int driver_type = UADK_ALG_HW; +- int ret; ++ char *alg = attrs->alg; ++ int ret = 0; + + if (!attrs->ctx_params) + return -WD_EINVAL; +@@ -2646,22 +2680,37 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) + switch (driver_type) { + case UADK_ALG_SOFT: + case UADK_ALG_CE_INSTR: +- /* No need to alloc resource */ +- if (sched_type != SCHED_POLICY_NONE) ++ /* No need to alloc resource */ ++ if (sched_type != SCHED_POLICY_NONE) { ++ WD_ERR("invalid sched_type\n"); + return -WD_EINVAL; ++ } ++ ++ ctx_config = calloc(1, sizeof(*ctx_config)); ++ if (!ctx_config) { ++ WD_ERR("fail to alloc ctx config\n"); ++ return -WD_ENOMEM; ++ } ++ attrs->ctx_config = ctx_config; + + alg_sched = wd_sched_rr_alloc(SCHED_POLICY_NONE, 1, 1, alg_poll_func); + if (!alg_sched) { + WD_ERR("fail to alloc scheduler\n"); +- return -WD_EINVAL; ++ goto out_ctx_config; + } ++ + attrs->sched = alg_sched; + +- ret = wd_sched_rr_instance(alg_sched, NULL); ++ ret = wd_alg_ce_ctx_init(attrs); + if (ret) { +- WD_ERR("fail to instance scheduler\n"); ++ WD_ERR("fail to init ce ctx\n"); + goto out_freesched; + } ++ ++ ret = alg_init_func(ctx_config, alg_sched); ++ if (ret) ++ goto out_pre_init; ++ + break; + case UADK_ALG_SVE_INSTR: + /* Todo lock cpu core */ +@@ -2720,7 +2769,10 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) + return 0; + + out_pre_init: +- wd_alg_ctx_uninit(ctx_config); ++ if (driver_type == UADK_ALG_CE_INSTR || driver_type == UADK_ALG_SOFT) ++ wd_alg_ce_ctx_uninit(ctx_config); ++ else ++ wd_alg_ctx_uninit(ctx_config); + out_freesched: + wd_sched_rr_release(alg_sched); + out_ctx_config: +@@ -2733,10 +2785,19 @@ void wd_alg_attrs_uninit(struct wd_init_attrs *attrs) + { + struct wd_ctx_config *ctx_config = attrs->ctx_config; + struct 
wd_sched *alg_sched = attrs->sched; ++ int driver_type = attrs->driver->calc_type; + +- if (ctx_config) { +- wd_alg_ctx_uninit(ctx_config); +- free(ctx_config); ++ if (driver_type == UADK_ALG_CE_INSTR || driver_type == UADK_ALG_SOFT) { ++ if (ctx_config) { ++ wd_alg_ce_ctx_uninit(ctx_config); ++ free(ctx_config); ++ } ++ } else { ++ if (ctx_config) { ++ wd_alg_ctx_uninit(ctx_config); ++ free(ctx_config); ++ } + } ++ + wd_sched_rr_release(alg_sched); + } +-- +2.25.1 + diff --git a/0014-uadk-fix-control-range-of-environmemt-variable.patch b/0014-uadk-fix-control-range-of-environmemt-variable.patch new file mode 100644 index 0000000..4b90bba --- /dev/null +++ b/0014-uadk-fix-control-range-of-environmemt-variable.patch @@ -0,0 +1,33 @@ +From 7869d42227f03754e4117a17751e6959b3f93bca Mon Sep 17 00:00:00 2001 +From: Zhiqi Song +Date: Mon, 11 Mar 2024 18:07:23 +0800 +Subject: [PATCH 14/44] uadk: fix control range of environmemt variable + +Environment variable will not be used in non-hardware +situation to config the ctx num. So add an interception +condition to avoid the impact of environment variables +on the initialization of non-hardware situations. 
+ +Signed-off-by: Zhiqi Song +--- + wd_util.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/wd_util.c b/wd_util.c +index 39909ca..b8e755c 100644 +--- a/wd_util.c ++++ b/wd_util.c +@@ -2111,8 +2111,9 @@ int wd_ctx_param_init(struct wd_ctx_params *ctx_params, + return -WD_ENOMEM; + } + ++ /* Only hw driver support environment variable */ + var_s = secure_getenv(env_name); +- if (var_s && strlen(var_s)) { ++ if (var_s && strlen(var_s) && driver->calc_type == UADK_ALG_HW) { + /* environment variable has the highest priority */ + ret = wd_env_set_ctx_nums(driver->alg_name, env_name, var_s, + ctx_params, max_op_type); +-- +2.25.1 + diff --git a/0015-uadk-util-use-default-sched_type-for-instruction-tas.patch b/0015-uadk-util-use-default-sched_type-for-instruction-tas.patch new file mode 100644 index 0000000..c22b859 --- /dev/null +++ b/0015-uadk-util-use-default-sched_type-for-instruction-tas.patch @@ -0,0 +1,72 @@ +From 1839b896bbb7cfaddbd8b19d322465c7ef7e185c Mon Sep 17 00:00:00 2001 +From: Zhiqi Song +Date: Mon, 11 Mar 2024 18:07:24 +0800 +Subject: [PATCH 15/44] uadk/util: use default sched_type for instruction task + +To prevent users from perceiving the difference in instruction +acceleration task, no longer check the specific sched_type, just +accept sched_type within the valid range from user, and use +default sched_type inside. + +As sched_type is checked before init2 calls wd_alg_attrs_init(). +Redundancy check is not needed. 
+ +Signed-off-by: Zhiqi Song +--- + include/wd_sched.h | 2 +- + wd_util.c | 12 ++---------- + 2 files changed, 3 insertions(+), 11 deletions(-) + +diff --git a/include/wd_sched.h b/include/wd_sched.h +index b145172..be541c6 100644 +--- a/include/wd_sched.h ++++ b/include/wd_sched.h +@@ -21,7 +21,7 @@ enum sched_policy_type { + SCHED_POLICY_RR = 0, + /* requests will no need ctxs */ + SCHED_POLICY_NONE, +- /* requests will need a fixed ctx */ ++ /* requests will need a fixed ctx */ + SCHED_POLICY_SINGLE, + SCHED_POLICY_BUTT, + }; +diff --git a/wd_util.c b/wd_util.c +index b8e755c..fb58167 100644 +--- a/wd_util.c ++++ b/wd_util.c +@@ -2681,12 +2681,6 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) + switch (driver_type) { + case UADK_ALG_SOFT: + case UADK_ALG_CE_INSTR: +- /* No need to alloc resource */ +- if (sched_type != SCHED_POLICY_NONE) { +- WD_ERR("invalid sched_type\n"); +- return -WD_EINVAL; +- } +- + ctx_config = calloc(1, sizeof(*ctx_config)); + if (!ctx_config) { + WD_ERR("fail to alloc ctx config\n"); +@@ -2694,6 +2688,7 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) + } + attrs->ctx_config = ctx_config; + ++ /* Use default sched_type to alloc scheduler */ + alg_sched = wd_sched_rr_alloc(SCHED_POLICY_NONE, 1, 1, alg_poll_func); + if (!alg_sched) { + WD_ERR("fail to alloc scheduler\n"); +@@ -2714,10 +2709,7 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) + + break; + case UADK_ALG_SVE_INSTR: +- /* Todo lock cpu core */ +- if (sched_type != SCHED_POLICY_SINGLE) +- return -WD_EINVAL; +- ++ /* Use default sched_type to alloc scheduler */ + alg_sched = wd_sched_rr_alloc(SCHED_POLICY_SINGLE, 1, 1, alg_poll_func); + if (!alg_sched) { + WD_ERR("fail to alloc scheduler\n"); +-- +2.25.1 + diff --git a/0016-uadk-digest-modify-spelling-errors.patch b/0016-uadk-digest-modify-spelling-errors.patch new file mode 100644 index 0000000..665f6f1 --- /dev/null +++ b/0016-uadk-digest-modify-spelling-errors.patch @@ -0,0 +1,184 @@ +From 
f7d1cbe8850ceae6de4aed1fd5fa81f029da753f Mon Sep 17 00:00:00 2001 +From: Zhiqi Song +Date: Fri, 15 Mar 2024 15:22:06 +0800 +Subject: [PATCH 16/44] uadk/digest: modify spelling errors + +Modify spelling errors related to digest stream mode. + +Signed-off-by: Zhiqi Song +--- + drv/hisi_sec.c | 8 ++++---- + drv/isa_ce_sm3.c | 4 ++-- + include/drv/wd_digest_drv.h | 6 +++--- + v1/test/hisi_sec_test/test_hisi_sec.c | 5 ++--- + v1/test/hisi_sec_test_sgl/test_hisi_sec_sgl.c | 5 ++--- + wd_digest.c | 2 +- + 6 files changed, 14 insertions(+), 16 deletions(-) + +diff --git a/drv/hisi_sec.c b/drv/hisi_sec.c +index 9da21a8..ac62109 100644 +--- a/drv/hisi_sec.c ++++ b/drv/hisi_sec.c +@@ -1553,7 +1553,7 @@ static int fill_digest_long_hash(handle_t h_qp, struct wd_digest_msg *msg, + if (ret) + return ret; + +- if (block_type == HASH_FRIST_BLOCK) { ++ if (block_type == HASH_FIRST_BLOCK) { + /* Long hash first */ + sqe->ai_apd_cs = AI_GEN_INNER; + sqe->ai_apd_cs |= AUTHPAD_NOPAD << AUTHPAD_OFFSET; +@@ -1635,7 +1635,7 @@ static int digest_bd2_type_check(struct wd_digest_msg *msg) + enum hash_block_type type = get_hash_block_type(msg); + + /* Long hash first and middle bd */ +- if (type == HASH_FRIST_BLOCK || type == HASH_MIDDLE_BLOCK) { ++ if (type == HASH_FIRST_BLOCK || type == HASH_MIDDLE_BLOCK) { + WD_ERR("hardware v2 not supports 0 size in long hash!\n"); + return -WD_EINVAL; + } +@@ -1653,7 +1653,7 @@ static int digest_bd3_type_check(struct wd_digest_msg *msg) + { + enum hash_block_type type = get_hash_block_type(msg); + /* Long hash first and middle bd */ +- if (type == HASH_FRIST_BLOCK || type == HASH_MIDDLE_BLOCK) { ++ if (type == HASH_FIRST_BLOCK || type == HASH_MIDDLE_BLOCK) { + WD_ERR("invalid: hardware v3 not supports 0 size in long hash!\n"); + return -WD_EINVAL; + } +@@ -1906,7 +1906,7 @@ static int fill_digest_long_hash3(handle_t h_qp, struct wd_digest_msg *msg, + if (ret) + return ret; + +- if (block_type == HASH_FRIST_BLOCK) { ++ if (block_type == HASH_FIRST_BLOCK) { 
+ /* Long hash first */ + sqe->auth_mac_key |= AI_GEN_INNER << SEC_AI_GEN_OFFSET_V3; + sqe->stream_scene.stream_auth_pad = AUTHPAD_NOPAD; +diff --git a/drv/isa_ce_sm3.c b/drv/isa_ce_sm3.c +index f16bdd3..0309861 100644 +--- a/drv/isa_ce_sm3.c ++++ b/drv/isa_ce_sm3.c +@@ -187,7 +187,7 @@ static int do_sm3_ce(struct wd_digest_msg *msg, __u8 *out_digest) + sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); + sm3_ce_final(&sctx, out_digest, sm3_ce_block_compress); + break; +- case HASH_FRIST_BLOCK: ++ case HASH_FIRST_BLOCK: + sm3_ce_init(&sctx); + sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); + trans_output_result(out_digest, sctx.word_reg); +@@ -305,7 +305,7 @@ static int do_hmac_sm3_ce(struct wd_digest_msg *msg, __u8 *out_hmac) + sm3_ce_hmac_update(&hctx, data, data_len); + sm3_ce_hmac_final(&hctx, out_hmac); + break; +- case HASH_FRIST_BLOCK: ++ case HASH_FIRST_BLOCK: + sm3_ce_hmac_init(&hctx, key, key_len); + sm3_ce_hmac_update(&hctx, data, data_len); + trans_output_result(out_hmac, hctx.sctx.word_reg); +diff --git a/include/drv/wd_digest_drv.h b/include/drv/wd_digest_drv.h +index 8a4aa0b..a55ef5b 100644 +--- a/include/drv/wd_digest_drv.h ++++ b/include/drv/wd_digest_drv.h +@@ -11,7 +11,7 @@ extern "C" { + #endif + + enum hash_block_type { +- HASH_FRIST_BLOCK, ++ HASH_FIRST_BLOCK, + HASH_MIDDLE_BLOCK, + HASH_END_BLOCK, + HASH_SINGLE_BLOCK, +@@ -65,13 +65,13 @@ static inline enum hash_block_type get_hash_block_type(struct wd_digest_msg *msg + { + /* + * [has_next , iv_bytes] +- * [ 1 , 0 ] = long hash(frist bd) ++ * [ 1 , 0 ] = long hash(first bd) + * [ 1 , 1 ] = long hash(middle bd) + * [ 0 , 1 ] = long hash(end bd) + * [ 0 , 0 ] = block hash(single bd) + */ + if (msg->has_next && !msg->iv_bytes) +- return HASH_FRIST_BLOCK; ++ return HASH_FIRST_BLOCK; + else if (msg->has_next && msg->iv_bytes) + return HASH_MIDDLE_BLOCK; + else if (!msg->has_next && msg->iv_bytes) +diff --git a/v1/test/hisi_sec_test/test_hisi_sec.c 
b/v1/test/hisi_sec_test/test_hisi_sec.c +index 824fe9e..7d94332 100644 +--- a/v1/test/hisi_sec_test/test_hisi_sec.c ++++ b/v1/test/hisi_sec_test/test_hisi_sec.c +@@ -1462,7 +1462,7 @@ static int sec_cipher_async_test(int thread_num, __u64 lcore_mask, + SEC_TST_PRT("%s(): create pool fail!\n", __func__); + return -ENOMEM; + } +- /* frist create the async poll thread! */ ++ /* first create the async poll thread! */ + test_thrds_data[0].pool = pool; + test_thrds_data[0].q = &q; + test_thrds_data[0].thread_num = 1; +@@ -2069,7 +2069,7 @@ static int sec_aead_async_test(int thd_num, __u64 lcore_mask, + SEC_TST_PRT("%s(): create pool fail!\n", __func__); + return -ENOMEM; + } +- /* frist create the async poll thread! */ ++ /* first create the async poll thread! */ + test_thrds_data[0].pool = pool; + test_thrds_data[0].q = &q; + test_thrds_data[0].thread_num = 1; +@@ -2082,7 +2082,6 @@ static int sec_aead_async_test(int thd_num, __u64 lcore_mask, + return ret; + } + +- //Ïß³ÌÊý Óë°óºË + if (_get_one_bits(lcore_mask) == 0 && + _get_one_bits(hcore_mask) == 0) + cnt = thd_num; +diff --git a/v1/test/hisi_sec_test_sgl/test_hisi_sec_sgl.c b/v1/test/hisi_sec_test_sgl/test_hisi_sec_sgl.c +index b7513d1..b13915f 100644 +--- a/v1/test/hisi_sec_test_sgl/test_hisi_sec_sgl.c ++++ b/v1/test/hisi_sec_test_sgl/test_hisi_sec_sgl.c +@@ -1733,7 +1733,7 @@ static int sec_cipher_async_test(int thread_num, __u64 lcore_mask, + SEC_TST_PRT("%s(): create pool fail!\n", __func__); + return -ENOMEM; + } +- /* frist create the async poll thread! */ ++ /* first create the async poll thread! */ + test_thrds_data[0].pool = pool; + test_thrds_data[0].q = &q; + test_thrds_data[0].thread_num = 1; +@@ -2640,7 +2640,7 @@ static int sec_aead_async_test(int thd_num, __u64 lcore_mask, + return -ENOMEM; + } + +- /* frist create the async poll thread! */ ++ /* first create the async poll thread! 
*/ + test_thrds_data[0].pool = pool; + test_thrds_data[0].q = &q; + test_thrds_data[0].thread_num = 1; +@@ -2654,7 +2654,6 @@ static int sec_aead_async_test(int thd_num, __u64 lcore_mask, + return ret; + } + +- //Ïß³ÌÊý Óë°óºË + if (_get_one_bits(lcore_mask) == 0 && + _get_one_bits(hcore_mask) == 0) + cnt = thd_num; +diff --git a/wd_digest.c b/wd_digest.c +index 491502a..10ac080 100644 +--- a/wd_digest.c ++++ b/wd_digest.c +@@ -53,7 +53,7 @@ struct wd_digest_stream_data { + /* Total data length for stream mode */ + __u64 long_data_len; + /* +- * Notify the stream message state, zero is frist message, ++ * Notify the stream message state, zero is first message, + * non-zero is middle or final message. + */ + int msg_state; +-- +2.25.1 + diff --git a/0017-uadk-drv-hisi-fix-failed-to-init-drv-after-fork.patch b/0017-uadk-drv-hisi-fix-failed-to-init-drv-after-fork.patch new file mode 100644 index 0000000..992f1a7 --- /dev/null +++ b/0017-uadk-drv-hisi-fix-failed-to-init-drv-after-fork.patch @@ -0,0 +1,152 @@ +From 6a6831101e99323fc5e9b63baa7e86ae8ac244ee Mon Sep 17 00:00:00 2001 +From: Yang Shen +Date: Thu, 22 Feb 2024 15:23:33 +0800 +Subject: [PATCH 17/44] uadk: drv/hisi - fix failed to init drv after fork + +The drivers initialization function use 'drv.priv' to forbid reinit. +But if the child process is forked after the parent process has +initialized, it can't work due to the drivers go to wrong branch on +initialization. + +And the algorithms initialization function is already protected +against re-entry. So it is unnecessary to check 'drv.priv' in driver. 
+ +Signed-off-by: Yang Shen +--- + drv/hisi_comp.c | 7 +------ + drv/hisi_hpre.c | 34 ++++++++++++++-------------------- + drv/hisi_sec.c | 7 +------ + 3 files changed, 16 insertions(+), 32 deletions(-) + +diff --git a/drv/hisi_comp.c b/drv/hisi_comp.c +index 2cb9a6b..a1af567 100644 +--- a/drv/hisi_comp.c ++++ b/drv/hisi_comp.c +@@ -787,18 +787,13 @@ static void hisi_zip_sqe_ops_adapt(handle_t h_qp) + + static int hisi_zip_init(struct wd_alg_driver *drv, void *conf) + { +- struct hisi_zip_ctx *priv = (struct hisi_zip_ctx *)drv->priv; + struct wd_ctx_config_internal *config = conf; + struct hisi_qm_priv qm_priv; ++ struct hisi_zip_ctx *priv; + handle_t h_qp = 0; + handle_t h_ctx; + __u32 i, j; + +- if (priv) { +- /* return if already inited */ +- return 0; +- } +- + if (!config->ctx_num) { + WD_ERR("invalid: zip init config ctx num is 0!\n"); + return -WD_EINVAL; +diff --git a/drv/hisi_hpre.c b/drv/hisi_hpre.c +index 049e60e..babc795 100644 +--- a/drv/hisi_hpre.c ++++ b/drv/hisi_hpre.c +@@ -527,62 +527,56 @@ out: + static int hpre_rsa_dh_init(struct wd_alg_driver *drv, void *conf) + { + struct wd_ctx_config_internal *config = (struct wd_ctx_config_internal *)conf; +- struct hisi_hpre_ctx *priv = (struct hisi_hpre_ctx *)drv->priv; + struct hisi_qm_priv qm_priv; ++ struct hisi_hpre_ctx *priv; + int ret; + +- if (priv) { +- /* return if already inited */ +- return WD_SUCCESS; +- } +- + if (!config->ctx_num) { + WD_ERR("invalid: hpre rsa/dh init config ctx num is 0!\n"); + return -WD_EINVAL; + } + +- drv->priv = malloc(sizeof(struct hisi_hpre_ctx)); +- if (!drv->priv) ++ priv = malloc(sizeof(struct hisi_hpre_ctx)); ++ if (!priv) + return -WD_EINVAL; + + qm_priv.op_type = HPRE_HW_V2_ALG_TYPE; +- ret = hpre_init_qm_priv(config, drv->priv, &qm_priv); ++ ret = hpre_init_qm_priv(config, priv, &qm_priv); + if (ret) { +- free(drv->priv); ++ free(priv); + return ret; + } + ++ drv->priv = priv; ++ + return WD_SUCCESS; + } + + static int hpre_ecc_init(struct wd_alg_driver *drv, 
void *conf) + { + struct wd_ctx_config_internal *config = (struct wd_ctx_config_internal *)conf; +- struct hisi_hpre_ctx *priv = (struct hisi_hpre_ctx *)drv->priv; + struct hisi_qm_priv qm_priv; ++ struct hisi_hpre_ctx *priv; + int ret; + +- if (priv) { +- /* return if already inited */ +- return WD_SUCCESS; +- } +- + if (!config->ctx_num) { + WD_ERR("invalid: hpre ecc init config ctx num is 0!\n"); + return -WD_EINVAL; + } + +- drv->priv = malloc(sizeof(struct hisi_hpre_ctx)); +- if (!drv->priv) ++ priv = malloc(sizeof(struct hisi_hpre_ctx)); ++ if (!priv) + return -WD_EINVAL; + + qm_priv.op_type = HPRE_HW_V3_ECC_ALG_TYPE; +- ret = hpre_init_qm_priv(config, drv->priv, &qm_priv); ++ ret = hpre_init_qm_priv(config, priv, &qm_priv); + if (ret) { +- free(drv->priv); ++ free(priv); + return ret; + } + ++ drv->priv = priv; ++ + return WD_SUCCESS; + } + +diff --git a/drv/hisi_sec.c b/drv/hisi_sec.c +index ac62109..852340d 100644 +--- a/drv/hisi_sec.c ++++ b/drv/hisi_sec.c +@@ -3041,18 +3041,13 @@ static int hisi_sec_aead_recv_v3(struct wd_alg_driver *drv, handle_t ctx, void * + + static int hisi_sec_init(struct wd_alg_driver *drv, void *conf) + { +- struct hisi_sec_ctx *priv = (struct hisi_sec_ctx *)drv->priv; + struct wd_ctx_config_internal *config = conf; + struct hisi_qm_priv qm_priv; ++ struct hisi_sec_ctx *priv; + handle_t h_qp = 0; + handle_t h_ctx; + __u32 i, j; + +- if (priv) { +- /* return if already inited */ +- return 0; +- } +- + if (!config->ctx_num) { + WD_ERR("invalid: sec init config ctx num is 0!\n"); + return -WD_EINVAL; +-- +2.25.1 + diff --git a/0018-wd_rsa-fix-wd_rsa_common_uninit-re-entry.patch b/0018-wd_rsa-fix-wd_rsa_common_uninit-re-entry.patch new file mode 100644 index 0000000..c1a6a98 --- /dev/null +++ b/0018-wd_rsa-fix-wd_rsa_common_uninit-re-entry.patch @@ -0,0 +1,33 @@ +From 2366ff7e765c5c451ab761cd0f9f9f6fbde4add3 Mon Sep 17 00:00:00 2001 +From: Zhangfei Gao +Date: Thu, 1 Feb 2024 14:25:14 +0000 +Subject: [PATCH 18/44] wd_rsa: fix 
wd_rsa_common_uninit re-entry + +Fix wd_rsa_common_uninit re-entry + +Fixs: 3fc344a drivers alloc and free resources by themself +Signed-off-by: Zhangfei Gao +--- + wd_rsa.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/wd_rsa.c b/wd_rsa.c +index de0b796..8e51177 100644 +--- a/wd_rsa.c ++++ b/wd_rsa.c +@@ -167,6 +167,12 @@ out_clear_ctx_config: + + static int wd_rsa_common_uninit(void) + { ++ enum wd_status status; ++ ++ wd_alg_get_init(&wd_rsa_setting.status, &status); ++ if (status == WD_UNINIT) ++ return -WD_EINVAL; ++ + /* uninit async request pool */ + wd_uninit_async_request_pool(&wd_rsa_setting.pool); + +-- +2.25.1 + diff --git a/0019-wd_dh-Fix-wd_aead_uninit-re-entry.patch b/0019-wd_dh-Fix-wd_aead_uninit-re-entry.patch new file mode 100644 index 0000000..5f547ed --- /dev/null +++ b/0019-wd_dh-Fix-wd_aead_uninit-re-entry.patch @@ -0,0 +1,32 @@ +From 72d2f8d98ee7322463f66be3aa8dea7a9e0b0811 Mon Sep 17 00:00:00 2001 +From: Zhangfei Gao +Date: Tue, 19 Mar 2024 02:37:51 +0000 +Subject: [PATCH 19/44] wd_dh: Fix wd_aead_uninit re-entry + +Check status for the re-entry + +Signed-off-by: Zhangfei Gao +--- + wd_dh.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/wd_dh.c b/wd_dh.c +index d23bb61..4d08de6 100644 +--- a/wd_dh.c ++++ b/wd_dh.c +@@ -127,6 +127,12 @@ out_clear_ctx_config: + + static int wd_dh_common_uninit(void) + { ++ enum wd_status status; ++ ++ wd_alg_get_init(&wd_dh_setting.status, &status); ++ if (status == WD_UNINIT) ++ return -WD_EINVAL; ++ + /* uninit async request pool */ + wd_uninit_async_request_pool(&wd_dh_setting.pool); + +-- +2.25.1 + diff --git a/0020-wd_ecc-Fix-wd_ecc_uninit-re-entry.patch b/0020-wd_ecc-Fix-wd_ecc_uninit-re-entry.patch new file mode 100644 index 0000000..6d0e453 --- /dev/null +++ b/0020-wd_ecc-Fix-wd_ecc_uninit-re-entry.patch @@ -0,0 +1,32 @@ +From 105fec19d2f5008009504e9e051dc2aec42bd113 Mon Sep 17 00:00:00 2001 +From: Zhangfei Gao +Date: Tue, 19 Mar 2024 02:40:09 +0000 +Subject: [PATCH 20/44] 
wd_ecc: Fix wd_ecc_uninit re-entry + +Check status for the re-entry + +Signed-off-by: Zhangfei Gao +--- + wd_ecc.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/wd_ecc.c b/wd_ecc.c +index 9aa1519..e75bca0 100644 +--- a/wd_ecc.c ++++ b/wd_ecc.c +@@ -190,6 +190,12 @@ out_clear_ctx_config: + + static int wd_ecc_common_uninit(void) + { ++ enum wd_status status; ++ ++ wd_alg_get_init(&wd_ecc_setting.status, &status); ++ if (status == WD_UNINIT) ++ return -WD_EINVAL; ++ + /* uninit async request pool */ + wd_uninit_async_request_pool(&wd_ecc_setting.pool); + +-- +2.25.1 + diff --git a/0021-wd_digest-uninit-check-status-in-one-func.patch b/0021-wd_digest-uninit-check-status-in-one-func.patch new file mode 100644 index 0000000..23bf5b6 --- /dev/null +++ b/0021-wd_digest-uninit-check-status-in-one-func.patch @@ -0,0 +1,71 @@ +From f690d2e248be5270b9cdda6f2b8af18af580ab49 Mon Sep 17 00:00:00 2001 +From: Zhangfei Gao +Date: Tue, 19 Mar 2024 02:42:39 +0000 +Subject: [PATCH 21/44] wd_digest: uninit check status in one func + +To simplify code, checking status in one func + +Signed-off-by: Zhangfei Gao +--- + wd_digest.c | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +diff --git a/wd_digest.c b/wd_digest.c +index 10ac080..0df7204 100644 +--- a/wd_digest.c ++++ b/wd_digest.c +@@ -296,23 +296,29 @@ out_clear_init: + return ret; + } + +-static void wd_digest_uninit_nolock(void) ++static int wd_digest_uninit_nolock(void) + { ++ enum wd_status status; ++ ++ wd_alg_get_init(&wd_digest_setting.status, &status); ++ if (status == WD_UNINIT) ++ return -WD_EINVAL; ++ + wd_uninit_async_request_pool(&wd_digest_setting.pool); + wd_clear_sched(&wd_digest_setting.sched); + wd_alg_uninit_driver(&wd_digest_setting.config, + wd_digest_setting.driver); ++ return 0; + } + + void wd_digest_uninit(void) + { +- enum wd_status status; ++ int ret; + +- wd_alg_get_init(&wd_digest_setting.status, &status); +- if (status == WD_UNINIT) ++ ret = 
wd_digest_uninit_nolock(); ++ if (ret) + return; + +- wd_digest_uninit_nolock(); + wd_digest_close_driver(); + wd_alg_clear_init(&wd_digest_setting.status); + } +@@ -419,13 +425,12 @@ out_uninit: + + void wd_digest_uninit2(void) + { +- enum wd_status status; ++ int ret; + +- wd_alg_get_init(&wd_digest_setting.status, &status); +- if (status == WD_UNINIT) ++ ret = wd_digest_uninit_nolock(); ++ if (ret) + return; + +- wd_digest_uninit_nolock(); + wd_alg_attrs_uninit(&wd_digest_init_attrs); + wd_alg_drv_unbind(wd_digest_setting.driver); + wd_dlclose_drv(wd_digest_setting.dlh_list); +-- +2.25.1 + diff --git a/0022-wd_aead-uninit-check-status-in-one-func.patch b/0022-wd_aead-uninit-check-status-in-one-func.patch new file mode 100644 index 0000000..e5e6a53 --- /dev/null +++ b/0022-wd_aead-uninit-check-status-in-one-func.patch @@ -0,0 +1,72 @@ +From e726680f9c8c9bfcf143d529be34d5b7ce2157be Mon Sep 17 00:00:00 2001 +From: Zhangfei Gao +Date: Tue, 19 Mar 2024 02:44:21 +0000 +Subject: [PATCH 22/44] wd_aead: uninit check status in one func + +To simplify code, checking status in one func + +Signed-off-by: Zhangfei Gao +--- + wd_aead.c | 24 +++++++++++++++--------- + 1 file changed, 15 insertions(+), 9 deletions(-) + +diff --git a/wd_aead.c b/wd_aead.c +index 34a3b86..57daa80 100644 +--- a/wd_aead.c ++++ b/wd_aead.c +@@ -485,23 +485,30 @@ out_clear_init: + return ret; + } + +-static void wd_aead_uninit_nolock(void) ++static int wd_aead_uninit_nolock(void) + { ++ enum wd_status status; ++ ++ wd_alg_get_init(&wd_aead_setting.status, &status); ++ if (status == WD_UNINIT) ++ return -WD_EINVAL; ++ + wd_uninit_async_request_pool(&wd_aead_setting.pool); + wd_clear_sched(&wd_aead_setting.sched); + wd_alg_uninit_driver(&wd_aead_setting.config, + wd_aead_setting.driver); ++ ++ return 0; + } + + void wd_aead_uninit(void) + { +- enum wd_status status; ++ int ret; + +- wd_alg_get_init(&wd_aead_setting.status, &status); +- if (status == WD_UNINIT) ++ ret = wd_aead_uninit_nolock(); ++ if 
(ret) + return; + +- wd_aead_uninit_nolock(); + wd_aead_close_driver(); + wd_alg_clear_init(&wd_aead_setting.status); + } +@@ -614,13 +621,12 @@ out_uninit: + + void wd_aead_uninit2(void) + { +- enum wd_status status; ++ int ret; + +- wd_alg_get_init(&wd_aead_setting.status, &status); +- if (status == WD_UNINIT) ++ ret = wd_aead_uninit_nolock(); ++ if (ret) + return; + +- wd_aead_uninit_nolock(); + wd_alg_attrs_uninit(&wd_aead_init_attrs); + wd_alg_drv_unbind(wd_aead_setting.driver); + wd_dlclose_drv(wd_aead_setting.dlh_list); +-- +2.25.1 + diff --git a/0023-makefile-install-wd_zlibwrapper.h-to-system.patch b/0023-makefile-install-wd_zlibwrapper.h-to-system.patch new file mode 100644 index 0000000..7f29306 --- /dev/null +++ b/0023-makefile-install-wd_zlibwrapper.h-to-system.patch @@ -0,0 +1,39 @@ +From 9d4a68db517d42ac3cb9ae66aabfb2ea73303344 Mon Sep 17 00:00:00 2001 +From: Zhangfei Gao +Date: Sun, 10 Mar 2024 13:57:53 +0000 +Subject: [PATCH 23/44] makefile: install wd_zlibwrapper.h to system + +wd_zlibwrapper.h is requird by other sub-system, so move +it to system header folder like /usr/local/include/uadk/ + +Signed-off-by: Zhangfei Gao +--- + Makefile.am | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/Makefile.am b/Makefile.am +index 19eab30..cd3d7e5 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -35,7 +35,8 @@ AM_CFLAGS+= -DUADK_RELEASED_TIME="\"Released ${MONTH} ${DAY}, ${YEAR}\"" + pkginclude_HEADERS = include/wd.h include/wd_cipher.h include/wd_aead.h \ + include/wd_comp.h include/wd_dh.h include/wd_digest.h \ + include/wd_rsa.h include/uacce.h include/wd_alg_common.h \ +- include/wd_ecc.h include/wd_sched.h include/wd_alg.h ++ include/wd_ecc.h include/wd_sched.h include/wd_alg.h \ ++ include/wd_zlibwrapper.h + + nobase_pkginclude_HEADERS = v1/wd.h v1/wd_cipher.h v1/wd_aead.h v1/uacce.h v1/wd_dh.h \ + v1/wd_digest.h v1/wd_rsa.h v1/wd_bmm.h +@@ -67,7 +68,7 @@ libwd_la_SOURCES=wd.c wd_mempool.c wd.h wd_alg.c wd_alg.h \ + 
v1/drv/hisi_rng_udrv.c v1/drv/hisi_rng_udrv.h + + libwd_comp_la_SOURCES=wd_comp.c wd_comp.h wd_comp_drv.h wd_util.c wd_util.h \ +- wd_sched.c wd_sched.h wd.c wd.h wd_zlibwrapper.c wd_zlibwrapper.h ++ wd_sched.c wd_sched.h wd.c wd.h wd_zlibwrapper.c + + libhisi_zip_la_SOURCES=drv/hisi_comp.c hisi_comp.h drv/hisi_qm_udrv.c \ + hisi_qm_udrv.h wd_comp_drv.h +-- +2.25.1 + diff --git a/0024-conf-fix-includedir.patch b/0024-conf-fix-includedir.patch new file mode 100644 index 0000000..002989d --- /dev/null +++ b/0024-conf-fix-includedir.patch @@ -0,0 +1,33 @@ +From 9590cd2df74a0bf82fa4d3420e851792195f782a Mon Sep 17 00:00:00 2001 +From: Zhangfei Gao +Date: Sun, 10 Mar 2024 14:02:03 +0000 +Subject: [PATCH 24/44] conf: fix includedir + +pkgincludedir already appended $(PACKAGE) [1], so no need +adding "uadk". Otherwise, header files will be installed to +"/usr/local/include/uadk/uadk/" + +[1] https://www.sourceware.org/autobook/autobook/autobook_76.html +pkgincludedir +This is a convenience variable whose value is +"$(includedir)/$(PACKAGE)". 
+ +Signed-off-by: Zhangfei Gao +--- + conf.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/conf.sh b/conf.sh +index 59af821..c361fbc 100755 +--- a/conf.sh ++++ b/conf.sh +@@ -16,5 +16,5 @@ ac_cv_func_malloc_0_nonnull=yes ac_cv_func_realloc_0_nonnull=yes ./configure \ + --enable-perf=yes \ + --host aarch64-linux-gnu \ + --target aarch64-linux-gnu \ +- --includedir=/usr/local/include/uadk \ ++ --includedir=/usr/local/include/ \ + $COMPILE_TYPE +-- +2.25.1 + diff --git a/0025-cipher-add-support-for-SM4-CBC-and-CTR-modes-in-CE-i.patch b/0025-cipher-add-support-for-SM4-CBC-and-CTR-modes-in-CE-i.patch new file mode 100644 index 0000000..7fbb8f4 --- /dev/null +++ b/0025-cipher-add-support-for-SM4-CBC-and-CTR-modes-in-CE-i.patch @@ -0,0 +1,1170 @@ +From eec2accd50fffe1399151112f53f4061b0eef2f0 Mon Sep 17 00:00:00 2001 +From: Wenkai Lin +Date: Wed, 20 Mar 2024 16:11:22 +0800 +Subject: [PATCH 25/44] cipher: add support for SM4 CBC and CTR modes in CE + instruction + +This patch implements the CE instruction using SM4 CBC and CTR modes, +and includes the necessary logic for mode-specific operations, +such as generating initialization vectors (IV) and handling chaining +and counter values. 
+ +Signed-off-by: Wenkai Lin +Signed-off-by: Qi Tao +--- + Makefile.am | 5 +- + drv/isa_ce_sm4.c | 235 +++++++++++++ + drv/isa_ce_sm4.h | 38 ++ + drv/isa_ce_sm4_armv8.S | 774 +++++++++++++++++++++++++++++++++++++++++ + v1/wd.c | 3 +- + v1/wd_rng.c | 4 +- + wd_cipher.c | 4 +- + 7 files changed, 1056 insertions(+), 7 deletions(-) + create mode 100644 drv/isa_ce_sm4.c + create mode 100644 drv/isa_ce_sm4.h + create mode 100644 drv/isa_ce_sm4_armv8.S + +diff --git a/Makefile.am b/Makefile.am +index cd3d7e5..f78ad14 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -78,6 +78,7 @@ libwd_crypto_la_SOURCES=wd_cipher.c wd_cipher.h wd_cipher_drv.h \ + wd_rsa.c wd_rsa.h wd_rsa_drv.h \ + wd_dh.c wd_dh.h wd_dh_drv.h \ + wd_ecc.c wd_ecc.h wd_ecc_drv.h \ ++ arm_arch_ce.h isa_ce_sm3.h isa_ce_sm4.h \ + wd_digest.c wd_digest.h wd_digest_drv.h \ + wd_util.c wd_util.h \ + wd_sched.c wd_sched.h \ +@@ -90,8 +91,8 @@ libhisi_sec_la_SOURCES=drv/hisi_sec.c drv/hisi_qm_udrv.c \ + libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \ + hisi_qm_udrv.h + +-libisa_ce_la_SOURCES=drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S arm_arch_ce.h \ +- drv/isa_ce_sm3.h ++libisa_ce_la_SOURCES=arm_arch_ce.h drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S isa_ce_sm3.h \ ++ drv/isa_ce_sm4.c drv/isa_ce_sm4_armv8.S drv/isa_ce_sm4.h + + if WD_STATIC_DRV + AM_CFLAGS += -DWD_STATIC_DRV -fPIC +diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c +new file mode 100644 +index 0000000..e2d81de +--- /dev/null ++++ b/drv/isa_ce_sm4.c +@@ -0,0 +1,235 @@ ++// SPDX-License-Identifier: Apache-2.0 ++/* ++ * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++/* ++ * Copyright 2024 Huawei Technologies Co.,Ltd. All rights reserved. 
++ */ ++ ++#include "drv/wd_cipher_drv.h" ++#include "wd_cipher.h" ++#include "isa_ce_sm4.h" ++ ++#define SM4_ENCRYPT 1 ++#define SM4_DECRYPT 0 ++#define MSG_Q_DEPTH 1024 ++#define INCREASE_BYTES 12 ++#define SM4_BLOCK_SIZE 16 ++#define MAX_BLOCK_NUM (1U << 28) ++#define CTR96_SHIFT_BITS 8 ++ ++#define GETU32(p) \ ++ ((__u32)(p)[0] << 24 | (__u32)(p)[1] << 16 | (__u32)(p)[2] << 8 | (__u32)(p)[3]) ++#define PUTU32(p, v) \ ++ ((p)[0] = (__u8)((v) >> 24), (p)[1] = (__u8)((v) >> 16), \ ++ (p)[2] = (__u8)((v) >> 8), (p)[3] = (__u8)(v)) ++ ++static int isa_ce_init(struct wd_alg_driver *drv, void *conf) ++{ ++ struct wd_ctx_config_internal *config = conf; ++ struct sm4_ce_drv_ctx *sctx = drv->priv; ++ ++ config->epoll_en = 0; ++ memcpy(&sctx->config, config, sizeof(struct wd_ctx_config_internal)); ++ ++ return 0; ++} ++ ++static void isa_ce_exit(struct wd_alg_driver *drv) ++{ ++} ++ ++/* increment upper 96 bits of 128-bit counter by 1 */ ++static void ctr96_inc(__u8 *counter) ++{ ++ __u32 n = INCREASE_BYTES; ++ __u32 c = 1; ++ ++ do { ++ --n; ++ c += counter[n]; ++ counter[n] = (__u8)c; ++ c >>= CTR96_SHIFT_BITS; ++ } while (n); ++} ++ ++static void sm4_v8_ctr32_encrypt(__u8 *in, __u8 *out, ++ __u64 len, const struct SM4_KEY *key, __u8 *iv) ++{ ++ __u8 ecount_buf[SM4_BLOCK_SIZE] = {0}; ++ __u64 blocks, offset; ++ __u32 ctr32; ++ __u32 n = 0; ++ ++ ctr32 = GETU32(iv + INCREASE_BYTES); ++ while (len >= SM4_BLOCK_SIZE) { ++ blocks = len / SM4_BLOCK_SIZE; ++ /* ++ * 1<<28 is just a not-so-small yet not-so-large number... ++ * Below condition is practically never met, but it has to ++ * be checked for code correctness. ++ */ ++ if (blocks > MAX_BLOCK_NUM) ++ blocks = MAX_BLOCK_NUM; ++ /* ++ * As (*func) operates on 32-bit counter, caller ++ * has to handle overflow. 'if' below detects the ++ * overflow, which is then handled by limiting the ++ * amount of blocks to the exact overflow point... 
++ */ ++ ctr32 += (__u32)blocks; ++ if (ctr32 < blocks) { ++ blocks -= ctr32; ++ ctr32 = 0; ++ } ++ sm4_v8_ctr32_encrypt_blocks(in, out, blocks, key, iv); ++ /* (*ctr) does not update iv, caller does: */ ++ PUTU32(iv + INCREASE_BYTES, ctr32); ++ /* ... overflow was detected, propagate carry. */ ++ if (ctr32 == 0) ++ ctr96_inc(iv); ++ offset = blocks * SM4_BLOCK_SIZE; ++ len -= offset; ++ out += offset; ++ in += offset; ++ } ++ if (len) { ++ sm4_v8_ctr32_encrypt_blocks(ecount_buf, ecount_buf, 1, key, iv); ++ ++ctr32; ++ PUTU32(iv + INCREASE_BYTES, ctr32); ++ if (ctr32 == 0) ++ ctr96_inc(iv); ++ while (len--) { ++ out[n] = in[n] ^ ecount_buf[n]; ++ ++n; ++ } ++ } ++} ++ ++static void sm4_ctr_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) ++{ ++ sm4_v8_ctr32_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, msg->iv); ++} ++ ++static void sm4_cbc_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) ++{ ++ sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, msg->iv, SM4_ENCRYPT); ++} ++ ++static void sm4_cbc_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_dec) ++{ ++ sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, msg->iv, SM4_DECRYPT); ++} ++ ++void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) ++{ ++ sm4_v8_set_encrypt_key(userKey, key); ++} ++ ++void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) ++{ ++ sm4_v8_set_decrypt_key(userKey, key); ++} ++ ++static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_msg) ++{ ++ struct wd_cipher_msg *msg = wd_msg; ++ struct SM4_KEY rkey; ++ ++ if (!msg) { ++ WD_ERR("invalid: input sm4 msg is NULL!\n"); ++ return -WD_EINVAL; ++ } ++ ++ if (msg->data_fmt == WD_SGL_BUF) { ++ WD_ERR("invalid: SM4 CE driver do not support sgl data format!\n"); ++ return -WD_EINVAL; ++ } ++ ++ if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR) ++ sm4_set_encrypt_key(msg->key, &rkey); ++ else ++ 
sm4_set_decrypt_key(msg->key, &rkey); ++ ++ switch (msg->mode) { ++ case WD_CIPHER_CBC: ++ if (msg->op_type == WD_CIPHER_ENCRYPTION) ++ sm4_cbc_encrypt(msg, &rkey); ++ else ++ sm4_cbc_decrypt(msg, &rkey); ++ break; ++ case WD_CIPHER_CTR: ++ sm4_ctr_encrypt(msg, &rkey); ++ break; ++ default: ++ WD_ERR("The current block cipher mode is not supported!\n"); ++ return -WD_EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int isa_ce_cipher_recv(struct wd_alg_driver *drv, handle_t ctx, void *wd_msg) ++{ ++ return 0; ++} ++ ++static int cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *msg) ++{ ++ return isa_ce_cipher_send(drv, ctx, msg); ++} ++ ++static int cipher_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg) ++{ ++ return isa_ce_cipher_recv(drv, ctx, msg); ++} ++ ++#define GEN_CE_ALG_DRIVER(ce_alg_name, alg_type) \ ++{\ ++ .drv_name = "isa_ce_sm4",\ ++ .alg_name = (ce_alg_name),\ ++ .calc_type = UADK_ALG_CE_INSTR,\ ++ .priority = 200,\ ++ .op_type_num = 1,\ ++ .fallback = 0,\ ++ .init = isa_ce_init,\ ++ .exit = isa_ce_exit,\ ++ .send = alg_type##_send,\ ++ .recv = alg_type##_recv,\ ++} ++ ++static struct wd_alg_driver cipher_alg_driver[] = { ++ GEN_CE_ALG_DRIVER("cbc(sm4)", cipher), ++ GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), ++}; ++ ++static void __attribute__((constructor)) isa_ce_probe(void) ++{ ++ __u32 alg_num, i; ++ int ret; ++ ++ WD_INFO("Info: register SM4 CE alg drivers!\n"); ++ ++ alg_num = ARRAY_SIZE(cipher_alg_driver); ++ for (i = 0; i < alg_num; i++) { ++ ret = wd_alg_driver_register(&cipher_alg_driver[i]); ++ if (ret && ret != -WD_ENODEV) ++ WD_ERR("Error: register SM4 CE %s failed!\n", ++ cipher_alg_driver[i].alg_name); ++ } ++} ++ ++static void __attribute__((destructor)) isa_ce_remove(void) ++{ ++ __u32 alg_num, i; ++ ++ WD_INFO("Info: unregister SM4 CE alg drivers!\n"); ++ alg_num = ARRAY_SIZE(cipher_alg_driver); ++ for (i = 0; i < alg_num; i++) ++ wd_alg_driver_unregister(&cipher_alg_driver[i]); ++} +diff --git a/drv/isa_ce_sm4.h 
b/drv/isa_ce_sm4.h +new file mode 100644 +index 0000000..0bc074d +--- /dev/null ++++ b/drv/isa_ce_sm4.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: Apache-2.0 */ ++/* Copyright 2024 Huawei Technologies Co.,Ltd. All rights reserved. */ ++ ++#ifndef __SM4_CE_DRV_H ++#define __SM4_CE_DRV_H ++ ++#pragma once ++#include ++#include "wd_alg_common.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define SM4_KEY_SCHEDULE 32 ++ ++struct SM4_KEY { ++ __u32 rk[SM4_KEY_SCHEDULE]; ++}; ++ ++struct sm4_ce_drv_ctx { ++ struct wd_ctx_config_internal config; ++}; ++ ++ ++void sm4_v8_set_encrypt_key(const unsigned char *userKey, struct SM4_KEY *key); ++void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); ++void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const struct SM4_KEY *key, ++ unsigned char *ivec, const int enc); ++void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, const unsigned char ivec[16]); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* __SM4_CE_DRV_H */ +diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S +new file mode 100644 +index 0000000..d7d172a +--- /dev/null ++++ b/drv/isa_ce_sm4_armv8.S +@@ -0,0 +1,774 @@ ++/* SPDX-License-Identifier: Apache-2.0 */ ++/* ++ * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. 
You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++#include "../include/drv/arm_arch_ce.h" ++ ++.arch armv8-a+crypto ++ ++.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ ++ 16, 17, 18, 19, 20, 21, 22, 23 24, 25, 26, 27, 28, 29, 30, 31 ++ .set .Lv\b\().4s, \b ++.endr ++ ++.macro sm4e, vd, vn ++ .inst 0xcec08400 | (.L\vn << 5) | .L\vd ++.endm ++ ++.macro sm4ekey, vd, vn, vm ++ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd ++.endm ++ ++.text ++.align 6 ++.Lck: ++.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc ++.globl sm4_v8_set_encrypt_key ++.type sm4_v8_set_encrypt_key,%function ++.align 5 ++sm4_v8_set_encrypt_key: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {v0.4s},[x0] ++ adr x2,.Lfk ++ ld1 {v24.4s},[x2] ++ adr x2,.Lck ++ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64 ++#ifndef __ARMEB__ ++ rev32 v0.16b,v0.16b ++#endif ++ ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2] ++ eor v0.16b,v0.16b,v24.16b; ++ sm4ekey v0.4s,v0.4s,v16.4s; ++ sm4ekey v1.4s,v0.4s,v17.4s; ++ sm4ekey v2.4s,v1.4s,v18.4s; ++ sm4ekey v3.4s,v2.4s,v19.4s; ++ sm4ekey v4.4s,v3.4s,v20.4s; ++ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64 ++ sm4ekey v5.4s,v4.4s,v21.4s; ++ sm4ekey v6.4s,v5.4s,v22.4s; ++ sm4ekey v7.4s,v6.4s,v23.4s; ++ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1] ++ ret ++.size sm4_v8_set_encrypt_key,.-sm4_v8_set_encrypt_key ++.globl sm4_v8_set_decrypt_key ++.type sm4_v8_set_decrypt_key,%function ++.align 5 ++sm4_v8_set_decrypt_key: ++ AARCH64_VALID_CALL_TARGET ++ ld1 
{v7.4s},[x0] ++ adr x2,.Lfk ++ ld1 {v24.4s},[x2] ++ adr x2, .Lck ++ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x2],64 ++#ifndef __ARMEB__ ++ rev32 v7.16b,v7.16b ++#endif ++ ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x2] ++ eor v7.16b, v7.16b,v24.16b; ++ sm4ekey v7.4s,v7.4s,v16.4s; ++ sm4ekey v6.4s,v7.4s,v17.4s; ++ sm4ekey v5.4s,v6.4s,v18.4s; ++ rev64 v7.4s,v7.4s ++ rev64 v6.4s,v6.4s ++ ext v7.16b,v7.16b,v7.16b,#8 ++ ext v6.16b,v6.16b,v6.16b,#8 ++ sm4ekey v4.4s,v5.4s,v19.4s; ++ sm4ekey v3.4s,v4.4s,v20.4s; ++ rev64 v5.4s,v5.4s ++ rev64 v4.4s,v4.4s ++ ext v5.16b,v5.16b,v5.16b,#8 ++ ext v4.16b,v4.16b,v4.16b,#8 ++ sm4ekey v2.4s,v3.4s,v21.4s; ++ sm4ekey v1.4s,v2.4s,v22.4s; ++ rev64 v3.4s,v3.4s ++ rev64 v2.4s,v2.4s ++ ext v3.16b,v3.16b,v3.16b,#8 ++ ext v2.16b,v2.16b,v2.16b,#8 ++ sm4ekey v0.4s,v1.4s,v23.4s; ++ rev64 v1.4s, v1.4s ++ rev64 v0.4s, v0.4s ++ ext v1.16b,v1.16b,v1.16b,#8 ++ ext v0.16b,v0.16b,v0.16b,#8 ++ st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],64 ++ st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1] ++ ret ++.size sm4_v8_set_decrypt_key,.-sm4_v8_set_decrypt_key ++.globl sm4_v8_cbc_encrypt ++.type sm4_v8_cbc_encrypt,%function ++.align 5 ++sm4_v8_cbc_encrypt: ++ AARCH64_VALID_CALL_TARGET ++ stp d8,d9,[sp, #-16]! 
++ ++ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64 ++ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] ++ ld1 {v8.4s},[x4] ++ cmp w5,#0 ++ b.eq .Ldec ++1: ++ cmp x2, #64 ++ b.lt 1f ++ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 ++ eor v16.16b,v16.16b,v8.16b ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++ sm4e v16.4s,v0.4s; ++ sm4e v16.4s,v1.4s; ++ sm4e v16.4s,v2.4s; ++ sm4e v16.4s,v3.4s; ++ sm4e v16.4s,v4.4s; ++ sm4e v16.4s,v5.4s; ++ sm4e v16.4s,v6.4s; ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4s,v16.4s ++ ext v16.16b,v16.16b,v16.16b,#8 ++ eor v17.16b,v17.16b,v16.16b ++ sm4e v17.4s,v0.4s; ++ sm4e v17.4s,v1.4s; ++ sm4e v17.4s,v2.4s; ++ sm4e v17.4s,v3.4s; ++ sm4e v17.4s,v4.4s; ++ sm4e v17.4s,v5.4s; ++ sm4e v17.4s,v6.4s; ++ sm4e v17.4s,v7.4s; ++ rev64 v17.4s,v17.4s ++ ext v17.16b,v17.16b,v17.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++ eor v18.16b,v18.16b,v17.16b ++ sm4e v18.4s,v0.4s; ++ sm4e v18.4s,v1.4s; ++ sm4e v18.4s,v2.4s; ++ sm4e v18.4s,v3.4s; ++ sm4e v18.4s,v4.4s; ++ sm4e v18.4s,v5.4s; ++ sm4e v18.4s,v6.4s; ++ sm4e v18.4s,v7.4s; ++ rev64 v18.4s,v18.4s ++ ext v18.16b,v18.16b,v18.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++ eor v19.16b,v19.16b,v18.16b ++ sm4e v19.4s,v0.4s; ++ sm4e v19.4s,v1.4s; ++ sm4e v19.4s,v2.4s; ++ sm4e v19.4s,v3.4s; ++ sm4e v19.4s,v4.4s; ++ sm4e v19.4s,v5.4s; ++ sm4e v19.4s,v6.4s; ++ sm4e v19.4s,v7.4s; ++ rev64 v19.4s,v19.4s ++ ext v19.16b,v19.16b,v19.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++ mov v8.16b,v19.16b ++ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 ++ subs x2,x2,#64 ++ b.ne 1b ++1: ++ subs x2,x2,#16 ++ b.lt 3f ++ ld1 {v16.4s},[x0],#16 ++ eor v8.16b,v8.16b,v16.16b ++#ifndef __ARMEB__ ++ rev32 v8.16b,v8.16b ++#endif ++ sm4e v8.4s,v0.4s; ++ sm4e v8.4s,v1.4s; ++ sm4e v8.4s,v2.4s; ++ 
sm4e v8.4s,v3.4s; ++ sm4e v8.4s,v4.4s; ++ sm4e v8.4s,v5.4s; ++ sm4e v8.4s,v6.4s; ++ sm4e v8.4s,v7.4s; ++ rev64 v8.4s,v8.4s ++ ext v8.16b,v8.16b,v8.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v8.16b,v8.16b ++#endif ++ st1 {v8.16b},[x1],#16 ++ b.ne 1b ++ b 3f ++.Ldec: ++1: ++ cmp x2, #64 ++ b.lt 1f ++ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0] ++ ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64 ++ cmp x2,#128 ++ b.lt 2f ++ // 8 blocks mode ++ ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0] ++ ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v20.16b,v20.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v21.16b,v21.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v22.16b,v22.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v23.16b,v23.16b ++#endif ++ sm4e v16.4s,v0.4s; ++ sm4e v17.4s,v0.4s; ++ sm4e v18.4s,v0.4s; ++ sm4e v19.4s,v0.4s; ++ ++ sm4e v16.4s,v1.4s; ++ sm4e v17.4s,v1.4s; ++ sm4e v18.4s,v1.4s; ++ sm4e v19.4s,v1.4s; ++ ++ sm4e v16.4s,v2.4s; ++ sm4e v17.4s,v2.4s; ++ sm4e v18.4s,v2.4s; ++ sm4e v19.4s,v2.4s; ++ ++ sm4e v16.4s,v3.4s; ++ sm4e v17.4s,v3.4s; ++ sm4e v18.4s,v3.4s; ++ sm4e v19.4s,v3.4s; ++ ++ sm4e v16.4s,v4.4s; ++ sm4e v17.4s,v4.4s; ++ sm4e v18.4s,v4.4s; ++ sm4e v19.4s,v4.4s; ++ ++ sm4e v16.4s,v5.4s; ++ sm4e v17.4s,v5.4s; ++ sm4e v18.4s,v5.4s; ++ sm4e v19.4s,v5.4s; ++ ++ sm4e v16.4s,v6.4s; ++ sm4e v17.4s,v6.4s; ++ sm4e v18.4s,v6.4s; ++ sm4e v19.4s,v6.4s; ++ ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4s,v16.4s ++ sm4e v17.4s,v7.4s; ++ ext v16.16b,v16.16b,v16.16b,#8 ++ rev64 v17.4s,v17.4s ++ sm4e v18.4s,v7.4s; ++ ext v17.16b,v17.16b,v17.16b,#8 ++ rev64 v18.4s,v18.4s ++ sm4e v19.4s,v7.4s; ++ ext v18.16b,v18.16b,v18.16b,#8 ++ rev64 v19.4s,v19.4s ++ ext v19.16b,v19.16b,v19.16b,#8 ++ sm4e v20.4s,v0.4s; ++ sm4e v21.4s,v0.4s; ++ sm4e v22.4s,v0.4s; ++ sm4e v23.4s,v0.4s; ++ ++ sm4e 
v20.4s,v1.4s; ++ sm4e v21.4s,v1.4s; ++ sm4e v22.4s,v1.4s; ++ sm4e v23.4s,v1.4s; ++ ++ sm4e v20.4s,v2.4s; ++ sm4e v21.4s,v2.4s; ++ sm4e v22.4s,v2.4s; ++ sm4e v23.4s,v2.4s; ++ ++ sm4e v20.4s,v3.4s; ++ sm4e v21.4s,v3.4s; ++ sm4e v22.4s,v3.4s; ++ sm4e v23.4s,v3.4s; ++ ++ sm4e v20.4s,v4.4s; ++ sm4e v21.4s,v4.4s; ++ sm4e v22.4s,v4.4s; ++ sm4e v23.4s,v4.4s; ++ ++ sm4e v20.4s,v5.4s; ++ sm4e v21.4s,v5.4s; ++ sm4e v22.4s,v5.4s; ++ sm4e v23.4s,v5.4s; ++ ++ sm4e v20.4s,v6.4s; ++ sm4e v21.4s,v6.4s; ++ sm4e v22.4s,v6.4s; ++ sm4e v23.4s,v6.4s; ++ ++ sm4e v20.4s,v7.4s; ++ rev64 v20.4s,v20.4s ++ sm4e v21.4s,v7.4s; ++ ext v20.16b,v20.16b,v20.16b,#8 ++ rev64 v21.4s,v21.4s ++ sm4e v22.4s,v7.4s; ++ ext v21.16b,v21.16b,v21.16b,#8 ++ rev64 v22.4s,v22.4s ++ sm4e v23.4s,v7.4s; ++ ext v22.16b,v22.16b,v22.16b,#8 ++ rev64 v23.4s,v23.4s ++ ext v23.16b,v23.16b,v23.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v20.16b,v20.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v21.16b,v21.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v22.16b,v22.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v23.16b,v23.16b ++#endif ++ eor v16.16b,v16.16b,v8.16b ++ eor v17.16b,v17.16b,v24.16b ++ eor v18.16b,v18.16b,v25.16b ++ mov v8.16b,v31.16b ++ eor v19.16b,v19.16b,v26.16b ++ eor v20.16b,v20.16b,v27.16b ++ eor v21.16b,v21.16b,v28.16b ++ eor v22.16b,v22.16b,v29.16b ++ eor v23.16b,v23.16b,v30.16b ++ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 ++ st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 ++ subs x2,x2,128 ++ b.gt 1b ++ b 3f ++ // 4 blocks mode ++2: ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++ sm4e v16.4s,v0.4s; ++ sm4e v17.4s,v0.4s; ++ sm4e v18.4s,v0.4s; ++ sm4e 
v19.4s,v0.4s; ++ ++ sm4e v16.4s,v1.4s; ++ sm4e v17.4s,v1.4s; ++ sm4e v18.4s,v1.4s; ++ sm4e v19.4s,v1.4s; ++ ++ sm4e v16.4s,v2.4s; ++ sm4e v17.4s,v2.4s; ++ sm4e v18.4s,v2.4s; ++ sm4e v19.4s,v2.4s; ++ ++ sm4e v16.4s,v3.4s; ++ sm4e v17.4s,v3.4s; ++ sm4e v18.4s,v3.4s; ++ sm4e v19.4s,v3.4s; ++ ++ sm4e v16.4s,v4.4s; ++ sm4e v17.4s,v4.4s; ++ sm4e v18.4s,v4.4s; ++ sm4e v19.4s,v4.4s; ++ ++ sm4e v16.4s,v5.4s; ++ sm4e v17.4s,v5.4s; ++ sm4e v18.4s,v5.4s; ++ sm4e v19.4s,v5.4s; ++ ++ sm4e v16.4s,v6.4s; ++ sm4e v17.4s,v6.4s; ++ sm4e v18.4s,v6.4s; ++ sm4e v19.4s,v6.4s; ++ ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4s,v16.4s ++ sm4e v17.4s,v7.4s; ++ ext v16.16b,v16.16b,v16.16b,#8 ++ rev64 v17.4s,v17.4s ++ sm4e v18.4s,v7.4s; ++ ext v17.16b,v17.16b,v17.16b,#8 ++ rev64 v18.4s,v18.4s ++ sm4e v19.4s,v7.4s; ++ ext v18.16b,v18.16b,v18.16b,#8 ++ rev64 v19.4s,v19.4s ++ ext v19.16b,v19.16b,v19.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++ eor v16.16b,v16.16b,v8.16b ++ eor v17.16b,v17.16b,v24.16b ++ mov v8.16b,v27.16b ++ eor v18.16b,v18.16b,v25.16b ++ eor v19.16b,v19.16b,v26.16b ++ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 ++ subs x2,x2,#64 ++ b.gt 1b ++1: ++ subs x2,x2,#16 ++ b.lt 3f ++ ld1 {v16.4s},[x0],#16 ++ mov v24.16b,v16.16b ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++ sm4e v16.4s,v0.4s; ++ sm4e v16.4s,v1.4s; ++ sm4e v16.4s,v2.4s; ++ sm4e v16.4s,v3.4s; ++ sm4e v16.4s,v4.4s; ++ sm4e v16.4s,v5.4s; ++ sm4e v16.4s,v6.4s; ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4s,v16.4s ++ ext v16.16b,v16.16b,v16.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++ eor v16.16b,v16.16b,v8.16b ++ mov v8.16b,v24.16b ++ st1 {v16.16b},[x1],#16 ++ b.ne 1b ++3: ++ // save back IV ++ st1 {v8.16b},[x4] ++ ldp d8,d9,[sp],#16 ++ ret ++.size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt ++.globl sm4_v8_ctr32_encrypt_blocks ++.type 
sm4_v8_ctr32_encrypt_blocks,%function ++.align 5 ++sm4_v8_ctr32_encrypt_blocks: ++ AARCH64_VALID_CALL_TARGET ++ stp d8,d9,[sp, #-16]! ++ ++ ld1 {v8.4s},[x4] ++ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],64 ++ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] ++#ifndef __ARMEB__ ++ rev32 v8.16b,v8.16b ++#endif ++ mov w5,v8.s[3] ++1: ++ cmp x2,#4 ++ b.lt 1f ++ ld1 {v24.4s,v25.4s,v26.4s,v27.4s},[x0],#64 ++ mov v16.16b,v8.16b ++ mov v17.16b,v8.16b ++ mov v18.16b,v8.16b ++ mov v19.16b,v8.16b ++ add w5,w5,#1 ++ mov v17.s[3],w5 ++ add w5,w5,#1 ++ mov v18.s[3],w5 ++ add w5,w5,#1 ++ mov v19.s[3],w5 ++ cmp x2,#8 ++ b.lt 2f ++ ld1 {v28.4s,v29.4s,v30.4s,v31.4s},[x0],#64 ++ mov v20.16b,v8.16b ++ mov v21.16b,v8.16b ++ mov v22.16b,v8.16b ++ mov v23.16b,v8.16b ++ add w5,w5,#1 ++ mov v20.s[3],w5 ++ add w5,w5,#1 ++ mov v21.s[3],w5 ++ add w5,w5,#1 ++ mov v22.s[3],w5 ++ add w5,w5,#1 ++ mov v23.s[3],w5 ++ sm4e v16.4s,v0.4s; ++ sm4e v17.4s,v0.4s; ++ sm4e v18.4s,v0.4s; ++ sm4e v19.4s,v0.4s; ++ ++ sm4e v16.4s,v1.4s; ++ sm4e v17.4s,v1.4s; ++ sm4e v18.4s,v1.4s; ++ sm4e v19.4s,v1.4s; ++ ++ sm4e v16.4s,v2.4s; ++ sm4e v17.4s,v2.4s; ++ sm4e v18.4s,v2.4s; ++ sm4e v19.4s,v2.4s; ++ ++ sm4e v16.4s,v3.4s; ++ sm4e v17.4s,v3.4s; ++ sm4e v18.4s,v3.4s; ++ sm4e v19.4s,v3.4s; ++ ++ sm4e v16.4s,v4.4s; ++ sm4e v17.4s,v4.4s; ++ sm4e v18.4s,v4.4s; ++ sm4e v19.4s,v4.4s; ++ ++ sm4e v16.4s,v5.4s; ++ sm4e v17.4s,v5.4s; ++ sm4e v18.4s,v5.4s; ++ sm4e v19.4s,v5.4s; ++ ++ sm4e v16.4s,v6.4s; ++ sm4e v17.4s,v6.4s; ++ sm4e v18.4s,v6.4s; ++ sm4e v19.4s,v6.4s; ++ ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4s,v16.4s ++ sm4e v17.4s,v7.4s; ++ ext v16.16b,v16.16b,v16.16b,#8 ++ rev64 v17.4s,v17.4s ++ sm4e v18.4s,v7.4s; ++ ext v17.16b,v17.16b,v17.16b,#8 ++ rev64 v18.4s,v18.4s ++ sm4e v19.4s,v7.4s; ++ ext v18.16b,v18.16b,v18.16b,#8 ++ rev64 v19.4s,v19.4s ++ ext v19.16b,v19.16b,v19.16b,#8 ++ sm4e v20.4s,v0.4s; ++ sm4e v21.4s,v0.4s; ++ sm4e v22.4s,v0.4s; ++ sm4e v23.4s,v0.4s; ++ ++ sm4e v20.4s,v1.4s; ++ sm4e v21.4s,v1.4s; ++ sm4e v22.4s,v1.4s; ++ sm4e 
v23.4s,v1.4s; ++ ++ sm4e v20.4s,v2.4s; ++ sm4e v21.4s,v2.4s; ++ sm4e v22.4s,v2.4s; ++ sm4e v23.4s,v2.4s; ++ ++ sm4e v20.4s,v3.4s; ++ sm4e v21.4s,v3.4s; ++ sm4e v22.4s,v3.4s; ++ sm4e v23.4s,v3.4s; ++ ++ sm4e v20.4s,v4.4s; ++ sm4e v21.4s,v4.4s; ++ sm4e v22.4s,v4.4s; ++ sm4e v23.4s,v4.4s; ++ ++ sm4e v20.4s,v5.4s; ++ sm4e v21.4s,v5.4s; ++ sm4e v22.4s,v5.4s; ++ sm4e v23.4s,v5.4s; ++ ++ sm4e v20.4s,v6.4s; ++ sm4e v21.4s,v6.4s; ++ sm4e v22.4s,v6.4s; ++ sm4e v23.4s,v6.4s; ++ ++ sm4e v20.4s,v7.4s; ++ rev64 v20.4s,v20.4s ++ sm4e v21.4s,v7.4s; ++ ext v20.16b,v20.16b,v20.16b,#8 ++ rev64 v21.4s,v21.4s ++ sm4e v22.4s,v7.4s; ++ ext v21.16b,v21.16b,v21.16b,#8 ++ rev64 v22.4s,v22.4s ++ sm4e v23.4s,v7.4s; ++ ext v22.16b,v22.16b,v22.16b,#8 ++ rev64 v23.4s,v23.4s ++ ext v23.16b,v23.16b,v23.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v20.16b,v20.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v21.16b,v21.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v22.16b,v22.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v23.16b,v23.16b ++#endif ++ eor v16.16b,v16.16b,v24.16b ++ eor v17.16b,v17.16b,v25.16b ++ eor v18.16b,v18.16b,v26.16b ++ eor v19.16b,v19.16b,v27.16b ++ eor v20.16b,v20.16b,v28.16b ++ eor v21.16b,v21.16b,v29.16b ++ eor v22.16b,v22.16b,v30.16b ++ eor v23.16b,v23.16b,v31.16b ++ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 ++ st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 ++ subs x2,x2,#8 ++ b.eq 3f ++ add w5,w5,#1 ++ mov v8.s[3],w5 ++ b 1b ++2: ++ sm4e v16.4s,v0.4s; ++ sm4e v17.4s,v0.4s; ++ sm4e v18.4s,v0.4s; ++ sm4e v19.4s,v0.4s; ++ ++ sm4e v16.4s,v1.4s; ++ sm4e v17.4s,v1.4s; ++ sm4e v18.4s,v1.4s; ++ sm4e v19.4s,v1.4s; ++ ++ sm4e v16.4s,v2.4s; ++ sm4e v17.4s,v2.4s; ++ sm4e v18.4s,v2.4s; ++ sm4e v19.4s,v2.4s; ++ ++ sm4e v16.4s,v3.4s; ++ sm4e v17.4s,v3.4s; ++ sm4e v18.4s,v3.4s; ++ sm4e v19.4s,v3.4s; ++ 
++ sm4e v16.4s,v4.4s; ++ sm4e v17.4s,v4.4s; ++ sm4e v18.4s,v4.4s; ++ sm4e v19.4s,v4.4s; ++ ++ sm4e v16.4s,v5.4s; ++ sm4e v17.4s,v5.4s; ++ sm4e v18.4s,v5.4s; ++ sm4e v19.4s,v5.4s; ++ ++ sm4e v16.4s,v6.4s; ++ sm4e v17.4s,v6.4s; ++ sm4e v18.4s,v6.4s; ++ sm4e v19.4s,v6.4s; ++ ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4s,v16.4s ++ sm4e v17.4s,v7.4s; ++ ext v16.16b,v16.16b,v16.16b,#8 ++ rev64 v17.4s,v17.4s ++ sm4e v18.4s,v7.4s; ++ ext v17.16b,v17.16b,v17.16b,#8 ++ rev64 v18.4s,v18.4s ++ sm4e v19.4s,v7.4s; ++ ext v18.16b,v18.16b,v18.16b,#8 ++ rev64 v19.4s,v19.4s ++ ext v19.16b,v19.16b,v19.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++ eor v16.16b,v16.16b,v24.16b ++ eor v17.16b,v17.16b,v25.16b ++ eor v18.16b,v18.16b,v26.16b ++ eor v19.16b,v19.16b,v27.16b ++ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 ++ subs x2,x2,#4 ++ b.eq 3f ++ add w5,w5,#1 ++ mov v8.s[3],w5 ++ b 1b ++1: ++ subs x2,x2,#1 ++ b.lt 3f ++ mov v16.16b,v8.16b ++ ld1 {v24.4s},[x0],#16 ++ sm4e v16.4s,v0.4s; ++ sm4e v16.4s,v1.4s; ++ sm4e v16.4s,v2.4s; ++ sm4e v16.4s,v3.4s; ++ sm4e v16.4s,v4.4s; ++ sm4e v16.4s,v5.4s; ++ sm4e v16.4s,v6.4s; ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4s,v16.4s ++ ext v16.16b,v16.16b,v16.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++ eor v16.16b,v16.16b,v24.16b ++ st1 {v16.4s},[x1],#16 ++ b.eq 3f ++ add w5,w5,#1 ++ mov v8.s[3],w5 ++ b 1b ++3: ++ ldp d8,d9,[sp],#16 ++ ret ++.size sm4_v8_ctr32_encrypt_blocks,.-sm4_v8_ctr32_encrypt_blocks +diff --git a/v1/wd.c b/v1/wd.c +index 26e7af3..4286bbe 100644 +--- a/v1/wd.c ++++ b/v1/wd.c +@@ -88,7 +88,8 @@ static int get_raw_attr(const char *dev_root, const char *attr, + if (ptrRet == NULL) + return -WD_ENODEV; + +- /* The attr_file = "/sys/class/uacce/xxx" ++ /* ++ * The attr_file = "/sys/class/uacce/xxx" + * It's the Internal Definition File Node + */ + fd = 
open(attr_path, O_RDONLY, 0); +diff --git a/v1/wd_rng.c b/v1/wd_rng.c +index 24a4b7a..7a89cd1 100644 +--- a/v1/wd_rng.c ++++ b/v1/wd_rng.c +@@ -57,7 +57,7 @@ static int wcrypto_setup_qinfo(struct wcrypto_rng_ctx_setup *setup, + WD_ERR("algorithm mismatch!\n"); + return ret; + } +- qinfo = q->qinfo; ++ qinfo = q->qinfo; + /* lock at ctx creating */ + wd_spinlock(&qinfo->qlock); + if (qinfo->ctx_num >= WD_MAX_CTX_NUM) { +@@ -120,7 +120,7 @@ void *wcrypto_create_rng_ctx(struct wd_queue *q, + return ctx; + + free_ctx_id: +- qinfo = q->qinfo; ++ qinfo = q->qinfo; + wd_spinlock(&qinfo->qlock); + qinfo->ctx_num--; + wd_free_id(qinfo->ctx_id, WD_MAX_CTX_NUM, ctx_id, WD_MAX_CTX_NUM); +diff --git a/wd_cipher.c b/wd_cipher.c +index f35ce6f..63ec362 100644 +--- a/wd_cipher.c ++++ b/wd_cipher.c +@@ -622,10 +622,10 @@ static int send_recv_sync(struct wd_ctx_internal *ctx, + msg_handle.send = wd_cipher_setting.driver->send; + msg_handle.recv = wd_cipher_setting.driver->recv; + +- pthread_spin_lock(&ctx->lock); ++ wd_ctx_spin_lock(ctx, wd_cipher_setting.driver->calc_type); + ret = wd_handle_msg_sync(wd_cipher_setting.driver, &msg_handle, ctx->ctx, + msg, NULL, wd_cipher_setting.config.epoll_en); +- pthread_spin_unlock(&ctx->lock); ++ wd_ctx_spin_unlock(ctx, wd_cipher_setting.driver->calc_type); + + return ret; + } +-- +2.25.1 + diff --git a/0026-cipher-add-support-for-SM4-CFB-and-XTS-modes-in-CE-i.patch b/0026-cipher-add-support-for-SM4-CFB-and-XTS-modes-in-CE-i.patch new file mode 100644 index 0000000..a28822a --- /dev/null +++ b/0026-cipher-add-support-for-SM4-CFB-and-XTS-modes-in-CE-i.patch @@ -0,0 +1,1348 @@ +From 091bbf55057370ab571d8a84cc33465ad145e1a9 Mon Sep 17 00:00:00 2001 +From: Yuzeng Zhuang +Date: Wed, 20 Mar 2024 16:12:48 +0800 +Subject: [PATCH 26/44] cipher: add support for SM4 CFB and XTS modes in CE + instruction + +This patch implements the CE instruction using SM4 CFB and XTS modes. 
+ +Signed-off-by: Yuzeng Zhuang +Signed-off-by: Qi Tao +--- + drv/isa_ce_sm4.c | 115 +++- + drv/isa_ce_sm4.h | 14 + + drv/isa_ce_sm4_armv8.S | 1126 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 1253 insertions(+), 2 deletions(-) + +diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c +index e2d81de..466b060 100644 +--- a/drv/isa_ce_sm4.c ++++ b/drv/isa_ce_sm4.c +@@ -22,6 +22,8 @@ + #define SM4_BLOCK_SIZE 16 + #define MAX_BLOCK_NUM (1U << 28) + #define CTR96_SHIFT_BITS 8 ++#define SM4_BYTES2BLKS(nbytes) ((nbytes) >> 4) ++#define SM4_KEY_SIZE 16 + + #define GETU32(p) \ + ((__u32)(p)[0] << 24 | (__u32)(p)[1] << 16 | (__u32)(p)[2] << 8 | (__u32)(p)[3]) +@@ -136,10 +138,104 @@ void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) + sm4_v8_set_decrypt_key(userKey, key); + } + ++static void sm4_cfb_crypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey, const int enc) ++{ ++ unsigned char keydata[SM4_BLOCK_SIZE]; ++ const unsigned char *src = msg->in; ++ unsigned char *dst = msg->out; ++ __u32 nbytes = msg->in_bytes; ++ __u32 blocks, bbytes; ++ __u32 i = 0; ++ ++ blocks = SM4_BYTES2BLKS(nbytes); ++ if (blocks) { ++ if (enc == SM4_ENCRYPT) ++ sm4_v8_cfb_encrypt_blocks(src, dst, blocks, rkey, msg->iv); ++ else ++ sm4_v8_cfb_decrypt_blocks(src, dst, blocks, rkey, msg->iv); ++ ++ bbytes = blocks * SM4_BLOCK_SIZE; ++ dst += bbytes; ++ src += bbytes; ++ nbytes -= bbytes; ++ } ++ ++ if (nbytes == 0) ++ return; ++ ++ sm4_v8_crypt_block(msg->iv, keydata, rkey); ++ while (nbytes > 0) { ++ *dst++ = *src++ ^ keydata[i++]; ++ nbytes--; ++ } ++ ++ /* store new IV */ ++ if (enc == SM4_ENCRYPT) { ++ if (msg->out_bytes >= msg->iv_bytes) ++ memcpy(msg->iv, msg->out + msg->out_bytes - ++ msg->iv_bytes, msg->iv_bytes); ++ else ++ memcpy(msg->iv, msg->out, msg->out_bytes); ++ } else { ++ if (msg->in_bytes >= msg->iv_bytes) ++ memcpy(msg->iv, msg->in + msg->in_bytes - ++ msg->iv_bytes, msg->iv_bytes); ++ else ++ memcpy(msg->iv, msg->in, msg->in_bytes); ++ } ++} ++ 
++static void sm4_cfb_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) ++{ ++ sm4_cfb_crypt(msg, rkey_enc, SM4_ENCRYPT); ++} ++ ++static void sm4_cfb_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_dec) ++{ ++ sm4_cfb_crypt(msg, rkey_dec, SM4_DECRYPT); ++} ++ ++static int sm4_xts_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey) ++{ ++ struct SM4_KEY rkey2; ++ ++ if (msg->in_bytes < SM4_BLOCK_SIZE) { ++ WD_ERR("invalid: cipher input length is wrong!\n"); ++ return -WD_EINVAL; ++ } ++ ++ /* set key for tweak */ ++ sm4_set_encrypt_key(msg->key + SM4_KEY_SIZE, &rkey2); ++ ++ sm4_v8_xts_encrypt(msg->in, msg->out, msg->in_bytes, ++ rkey, msg->iv, &rkey2); ++ ++ return 0; ++} ++ ++static int sm4_xts_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey) ++{ ++ struct SM4_KEY rkey2; ++ ++ if (msg->in_bytes < SM4_BLOCK_SIZE) { ++ WD_ERR("invalid: cipher input length is wrong!\n"); ++ return -WD_EINVAL; ++ } ++ ++ /* set key for tweak */ ++ sm4_set_encrypt_key(msg->key + SM4_KEY_SIZE, &rkey2); ++ ++ sm4_v8_xts_decrypt(msg->in, msg->out, msg->in_bytes, ++ rkey, msg->iv, &rkey2); ++ ++ return 0; ++} ++ + static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_msg) + { + struct wd_cipher_msg *msg = wd_msg; + struct SM4_KEY rkey; ++ int ret = 0; + + if (!msg) { + WD_ERR("invalid: input sm4 msg is NULL!\n"); +@@ -151,7 +247,8 @@ static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_ + return -WD_EINVAL; + } + +- if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR) ++ if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR ++ || msg->mode == WD_CIPHER_CFB) + sm4_set_encrypt_key(msg->key, &rkey); + else + sm4_set_decrypt_key(msg->key, &rkey); +@@ -166,12 +263,24 @@ static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_ + case WD_CIPHER_CTR: + sm4_ctr_encrypt(msg, &rkey); + break; ++ case WD_CIPHER_CFB: ++ if 
(msg->op_type == WD_CIPHER_ENCRYPTION) ++ sm4_cfb_encrypt(msg, &rkey); ++ else ++ sm4_cfb_decrypt(msg, &rkey); ++ break; ++ case WD_CIPHER_XTS: ++ if (msg->op_type == WD_CIPHER_ENCRYPTION) ++ ret = sm4_xts_encrypt(msg, &rkey); ++ else ++ ret = sm4_xts_decrypt(msg, &rkey); ++ break; + default: + WD_ERR("The current block cipher mode is not supported!\n"); + return -WD_EINVAL; + } + +- return 0; ++ return ret; + } + + static int isa_ce_cipher_recv(struct wd_alg_driver *drv, handle_t ctx, void *wd_msg) +@@ -206,6 +315,8 @@ static int cipher_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg) + static struct wd_alg_driver cipher_alg_driver[] = { + GEN_CE_ALG_DRIVER("cbc(sm4)", cipher), + GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), ++ GEN_CE_ALG_DRIVER("cfb(sm4)", cipher), ++ GEN_CE_ALG_DRIVER("xts(sm4)", cipher), + }; + + static void __attribute__((constructor)) isa_ce_probe(void) +diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h +index 0bc074d..d92069f 100644 +--- a/drv/isa_ce_sm4.h ++++ b/drv/isa_ce_sm4.h +@@ -31,6 +31,20 @@ void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, + void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, const unsigned char ivec[16]); + ++void sm4_v8_cfb_encrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t length, const struct SM4_KEY *key, unsigned char *ivec); ++void sm4_v8_cfb_decrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t length, const struct SM4_KEY *key, unsigned char *ivec); ++void sm4_v8_crypt_block(const unsigned char *in, unsigned char *out, ++ const struct SM4_KEY *key); ++ ++int sm4_v8_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, ++ const struct SM4_KEY *key, unsigned char *ivec, ++ const struct SM4_KEY *key2); ++int sm4_v8_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, ++ const struct SM4_KEY *key, unsigned char *ivec, ++ const struct SM4_KEY *key2); ++ + #ifdef 
__cplusplus + } + #endif +diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S +index d7d172a..342dfa5 100644 +--- a/drv/isa_ce_sm4_armv8.S ++++ b/drv/isa_ce_sm4_armv8.S +@@ -37,6 +37,14 @@ + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 + .Lfk: + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc ++.align 4 ++.cts_permute_table: ++.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff ++.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff ++.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 ++.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf ++.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff ++.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .globl sm4_v8_set_encrypt_key + .type sm4_v8_set_encrypt_key,%function + .align 5 +@@ -772,3 +780,1121 @@ sm4_v8_ctr32_encrypt_blocks: + ldp d8,d9,[sp],#16 + ret + .size sm4_v8_ctr32_encrypt_blocks,.-sm4_v8_ctr32_encrypt_blocks ++ ++.globl sm4_v8_crypt_block ++.type sm4_v8_crypt_block,%function ++.align 5 ++sm4_v8_crypt_block: ++ /* parameters: ++ * x0: src ++ * x1: dst ++ * x2: key ++ */ ++ AARCH64_VALID_CALL_TARGET ++ ++ ld1 {v0.16b-v3.16b}, [x2], #64 ++ ld1 {v4.16b-v7.16b}, [x2] ++ ++ ld1 {v16.4s},[x0] ++ ++ rev32 v16.16b, v16.16b ++ sm4e v16.4s, v0.4s ++ sm4e v16.4s, v1.4s ++ sm4e v16.4s, v2.4s ++ sm4e v16.4s, v3.4s ++ sm4e v16.4s, v4.4s ++ sm4e v16.4s, v5.4s ++ sm4e v16.4s, v6.4s ++ sm4e v16.4s, v7.4s ++ rev64 v16.4s, v16.4s ++ ext v16.16b, v16.16b, v16.16b, #8 ++ rev32 v16.16b, v16.16b ++ ++ st1 {v16.16b}, [x1]; ++ ++ ret ++.size sm4_v8_crypt_block,.-sm4_v8_crypt_block ++ ++.globl sm4_v8_cfb_encrypt_blocks ++.type sm4_v8_cfb_encrypt_blocks,%function ++.align 5 ++sm4_v8_cfb_encrypt_blocks: ++ /* parameters: ++ * x0: src ++ * x1: dst ++ * w2: nblocks ++ * x3: key ++ * x4: iv ++ */ ++ AARCH64_VALID_CALL_TARGET ++ stp d8,d9,[sp, #-16]! 
++ ++ ld1 {v0.4s-v3.4s}, [x3], #64 ++ ld1 {v4.4s-v7.4s}, [x3] ++ ++ ld1 {v8.4s},[x4] ++ ++.loop_cfb_enc_4block: ++ cmp w2, #4 ++ blt .loob_cfb_enc_1block ++ ++ sub w2, w2, #4 ++ ++ ld1 {v16.4s-v19.4s}, [x0], #64 ++ ++ rev32 v8.16b, v8.16b ++ sm4e v8.4s, v0.4s ++ sm4e v8.4s, v1.4s ++ sm4e v8.4s, v2.4s ++ sm4e v8.4s, v3.4s ++ sm4e v8.4s, v4.4s ++ sm4e v8.4s, v5.4s ++ sm4e v8.4s, v6.4s ++ sm4e v8.4s, v7.4s ++ rev64 v8.4s, v8.4s ++ ext v8.16b, v8.16b, v8.16b, #8 ++ rev32 v8.16b, v8.16b ++ eor v16.16b, v16.16b, v8.16b ++ ++ rev32 v8.16b, v16.16b ++ sm4e v8.4s, v0.4s ++ sm4e v8.4s, v1.4s ++ sm4e v8.4s, v2.4s ++ sm4e v8.4s, v3.4s ++ sm4e v8.4s, v4.4s ++ sm4e v8.4s, v5.4s ++ sm4e v8.4s, v6.4s ++ sm4e v8.4s, v7.4s ++ rev64 v8.4s, v8.4s ++ ext v8.16b, v8.16b, v8.16b, #8 ++ rev32 v8.16b, v8.16b ++ eor v17.16b, v17.16b, v8.16b ++ ++ rev32 v8.16b, v17.16b ++ sm4e v8.4s, v0.4s ++ sm4e v8.4s, v1.4s ++ sm4e v8.4s, v2.4s ++ sm4e v8.4s, v3.4s ++ sm4e v8.4s, v4.4s ++ sm4e v8.4s, v5.4s ++ sm4e v8.4s, v6.4s ++ sm4e v8.4s, v7.4s ++ rev64 v8.4s, v8.4s ++ ext v8.16b, v8.16b, v8.16b, #8 ++ rev32 v8.16b, v8.16b ++ eor v18.16b, v18.16b, v8.16b ++ ++ rev32 v8.16b, v18.16b ++ sm4e v8.4s, v0.4s ++ sm4e v8.4s, v1.4s ++ sm4e v8.4s, v2.4s ++ sm4e v8.4s, v3.4s ++ sm4e v8.4s, v4.4s ++ sm4e v8.4s, v5.4s ++ sm4e v8.4s, v6.4s ++ sm4e v8.4s, v7.4s ++ rev64 v8.4s, v8.4s ++ ext v8.16b, v8.16b, v8.16b, #8 ++ rev32 v8.16b, v8.16b ++ eor v19.16b, v19.16b, v8.16b ++ ++ st1 {v16.4s-v19.4s}, [x1], #64 ++ mov v8.16b, v19.16b ++ ++ cbz w2, .end_cfb_enc ++ b .loop_cfb_enc_4block ++ ++.loob_cfb_enc_1block: ++ sub w2, w2, #1 ++ ++ ld1 {v16.4s}, [x0], #16 ++ ++ rev32 v8.16b, v8.16b ++ sm4e v8.4s, v0.4s ++ sm4e v8.4s, v1.4s ++ sm4e v8.4s, v2.4s ++ sm4e v8.4s, v3.4s ++ sm4e v8.4s, v4.4s ++ sm4e v8.4s, v5.4s ++ sm4e v8.4s, v6.4s ++ sm4e v8.4s, v7.4s ++ rev64 v8.4s, v8.4s ++ ext v8.16b, v8.16b, v8.16b, #8 ++ rev32 v8.16b, v8.16b ++ eor v8.16b, v8.16b, v16.16b ++ ++ st1 {v8.4s}, [x1], #16 ++ ++ cbnz w2, 
.loob_cfb_enc_1block ++ ++.end_cfb_enc: ++ st1 {v8.4s}, [x4] ++ ++ ldp d8,d9,[sp],#16 ++ ret ++.size sm4_v8_cfb_encrypt_blocks,.-sm4_v8_cfb_encrypt_blocks ++ ++.globl sm4_v8_cfb_decrypt_blocks ++.type sm4_v8_cfb_decrypt_blocks,%function ++.align 5 ++sm4_v8_cfb_decrypt_blocks: ++ /* parameters: ++ * x0: src ++ * x1: dst ++ * w2: nblocks ++ * x3: key ++ * x4: iv ++ */ ++ AARCH64_VALID_CALL_TARGET ++ stp d8,d9,[sp, #-16]! ++ ++ ld1 {v0.4s-v3.4s}, [x3], #64 ++ ld1 {v4.4s-v7.4s}, [x3] ++ ++ ld1 {v8.4s},[x4] ++ ++.loop_cfb_dec_8block: ++ cmp w2, #8 ++ blt .cfb_dec_4block ++ ++ sub w2, w2, #8 ++ ++ ld1 {v12.4s-v15.4s}, [x0], #64 ++ ld1 {v16.4s-v19.4s}, [x0], #64 ++ ++ rev32 v20.16b, v8.16b ++ rev32 v21.16b, v12.16b ++ rev32 v22.16b, v13.16b ++ rev32 v23.16b, v14.16b ++ rev32 v24.16b, v15.16b ++ rev32 v25.16b, v16.16b ++ rev32 v26.16b, v17.16b ++ rev32 v27.16b, v18.16b ++ sm4e v20.4s, v0.4s ++ sm4e v21.4s, v0.4s ++ sm4e v22.4s, v0.4s ++ sm4e v23.4s, v0.4s ++ sm4e v24.4s, v0.4s ++ sm4e v25.4s, v0.4s ++ sm4e v26.4s, v0.4s ++ sm4e v27.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v21.4s, v1.4s ++ sm4e v22.4s, v1.4s ++ sm4e v23.4s, v1.4s ++ sm4e v24.4s, v1.4s ++ sm4e v25.4s, v1.4s ++ sm4e v26.4s, v1.4s ++ sm4e v27.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v21.4s, v2.4s ++ sm4e v22.4s, v2.4s ++ sm4e v23.4s, v2.4s ++ sm4e v24.4s, v2.4s ++ sm4e v25.4s, v2.4s ++ sm4e v26.4s, v2.4s ++ sm4e v27.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v21.4s, v3.4s ++ sm4e v22.4s, v3.4s ++ sm4e v23.4s, v3.4s ++ sm4e v24.4s, v3.4s ++ sm4e v25.4s, v3.4s ++ sm4e v26.4s, v3.4s ++ sm4e v27.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v21.4s, v4.4s ++ sm4e v22.4s, v4.4s ++ sm4e v23.4s, v4.4s ++ sm4e v24.4s, v4.4s ++ sm4e v25.4s, v4.4s ++ sm4e v26.4s, v4.4s ++ sm4e v27.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v21.4s, v5.4s ++ sm4e v22.4s, v5.4s ++ sm4e v23.4s, v5.4s ++ sm4e v24.4s, v5.4s ++ sm4e v25.4s, v5.4s ++ sm4e v26.4s, v5.4s ++ sm4e v27.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v21.4s, v6.4s ++ sm4e v22.4s, v6.4s ++ 
sm4e v23.4s, v6.4s ++ sm4e v24.4s, v6.4s ++ sm4e v25.4s, v6.4s ++ sm4e v26.4s, v6.4s ++ sm4e v27.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ sm4e v21.4s, v7.4s ++ sm4e v22.4s, v7.4s ++ sm4e v23.4s, v7.4s ++ sm4e v24.4s, v7.4s ++ sm4e v25.4s, v7.4s ++ sm4e v26.4s, v7.4s ++ sm4e v27.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ rev64 v21.4s, v21.4s ++ rev64 v22.4s, v22.4s ++ rev64 v23.4s, v23.4s ++ rev64 v24.4s, v24.4s ++ rev64 v25.4s, v25.4s ++ rev64 v26.4s, v26.4s ++ rev64 v27.4s, v27.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ ext v21.16b, v21.16b, v21.16b, #8 ++ ext v22.16b, v22.16b, v22.16b, #8 ++ ext v23.16b, v23.16b, v23.16b, #8 ++ ext v24.16b, v24.16b, v24.16b, #8 ++ ext v25.16b, v25.16b, v25.16b, #8 ++ ext v26.16b, v26.16b, v26.16b, #8 ++ ext v27.16b, v27.16b, v27.16b, #8 ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ rev32 v24.16b, v24.16b ++ rev32 v25.16b, v25.16b ++ rev32 v26.16b, v26.16b ++ rev32 v27.16b, v27.16b ++ ++ mov v8.16b, v19.16b //Modify IV ++ ++ eor v20.16b, v20.16b, v12.16b ++ eor v21.16b, v21.16b, v13.16b ++ eor v22.16b, v22.16b, v14.16b ++ eor v23.16b, v23.16b, v15.16b ++ eor v24.16b, v24.16b, v16.16b ++ eor v25.16b, v25.16b, v17.16b ++ eor v26.16b, v26.16b, v18.16b ++ eor v27.16b, v27.16b, v19.16b ++ ++ st1 {v20.4s-v23.4s}, [x1], #64 ++ st1 {v24.4s-v27.4s}, [x1], #64 ++ ++ cbz w2, .end_cfb_dec ++ b .loop_cfb_dec_8block ++ ++.cfb_dec_4block: ++ cmp w2, #4 ++ blt .loop_cfb_dec_1block ++ ++ sub w2, w2, #4 ++ ++ ld1 {v12.4s-v15.4s}, [x0], #64 ++ ++ rev32 v20.16b, v8.16b ++ rev32 v21.16b, v12.16b ++ rev32 v22.16b, v13.16b ++ rev32 v23.16b, v14.16b ++ sm4e v20.4s, v0.4s ++ sm4e v21.4s, v0.4s ++ sm4e v22.4s, v0.4s ++ sm4e v23.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v21.4s, v1.4s ++ sm4e v22.4s, v1.4s ++ sm4e v23.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v21.4s, v2.4s ++ sm4e v22.4s, v2.4s ++ sm4e v23.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v21.4s, v3.4s ++ sm4e v22.4s, v3.4s ++ sm4e v23.4s, v3.4s ++ sm4e 
v20.4s, v4.4s ++ sm4e v21.4s, v4.4s ++ sm4e v22.4s, v4.4s ++ sm4e v23.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v21.4s, v5.4s ++ sm4e v22.4s, v5.4s ++ sm4e v23.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v21.4s, v6.4s ++ sm4e v22.4s, v6.4s ++ sm4e v23.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ sm4e v21.4s, v7.4s ++ sm4e v22.4s, v7.4s ++ sm4e v23.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ rev64 v21.4s, v21.4s ++ rev64 v22.4s, v22.4s ++ rev64 v23.4s, v23.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ ext v21.16b, v21.16b, v21.16b, #8 ++ ext v22.16b, v22.16b, v22.16b, #8 ++ ext v23.16b, v23.16b, v23.16b, #8 ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ ++ mov v8.16b, v15.16b //Modify IV ++ ++ eor v20.16b, v20.16b, v12.16b ++ eor v21.16b, v21.16b, v13.16b ++ eor v22.16b, v22.16b, v14.16b ++ eor v23.16b, v23.16b, v15.16b ++ ++ st1 {v20.4s-v23.4s}, [x1], #64 ++ ++ cbz w2, .end_cfb_dec ++ ++.loop_cfb_dec_1block: ++ sub w2, w2, #1 ++ ++ ld1 {v12.4s}, [x0], #16 ++ ++ rev32 v20.16b, v8.16b ++ sm4e v20.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ rev32 v20.16b, v20.16b ++ ++ eor v20.16b, v20.16b, v12.16b ++ st1 {v20.4s}, [x1], #16 ++ ++ mov v8.16b, v12.16b //Modify IV ++ ++ cbnz w2, .loop_cfb_dec_1block ++ ++.end_cfb_dec: ++ /* store new IV */ ++ st1 {v8.4s}, [x4] ++ ++ ldp d8,d9,[sp],#16 ++ ret ++.size sm4_v8_cfb_decrypt_blocks,.-sm4_v8_cfb_decrypt_blocks ++ ++#define tweak_calc(out, in, MSK, TMP) \ ++ sshr TMP.2d, in.2d, #63; \ ++ and TMP.16b, TMP.16b, MSK.16b; \ ++ add out.2d, in.2d, in.2d; \ ++ ext TMP.16b, TMP.16b, TMP.16b, #8; \ ++ eor out.16b, out.16b, TMP.16b; ++ ++.globl sm4_v8_xts_encrypt ++.type sm4_v8_xts_encrypt,%function ++.align 5 ++sm4_v8_xts_encrypt: ++ /* parameters: ++ * x0: src ++ * x1: dst ++ * w2: nbytes ++ * x3: key ++ * x4: tweak ++ * 
x5: key array for tweak ++ */ ++ AARCH64_VALID_CALL_TARGET ++ stp d8,d9,[sp, #-16]! ++ ++ ld1 {v8.16b}, [x4] ++ ++ cbz x5, .enc_xts_nokey2 ++ ++ /* load round key array for tweak */ ++ ld1 {v0.16b-v3.16b}, [x5], #64 ++ ld1 {v4.16b-v7.16b}, [x5] ++ ++ /* first tweak */ ++ rev32 v8.16b, v8.16b ++ sm4e v8.4s, v0.4s ++ sm4e v8.4s, v1.4s ++ sm4e v8.4s, v2.4s ++ sm4e v8.4s, v3.4s ++ sm4e v8.4s, v4.4s ++ sm4e v8.4s, v5.4s ++ sm4e v8.4s, v6.4s ++ sm4e v8.4s, v7.4s ++ rev64 v8.4s, v8.4s ++ ext v8.16b, v8.16b, v8.16b, #8 ++ rev32 v8.16b, v8.16b ++ ++.enc_xts_nokey2: ++ /* load key array */ ++ ld1 {v0.16b-v3.16b}, [x3], #64 ++ ld1 {v4.16b-v7.16b}, [x3] ++ ++ and w5, w2, #15 ++ lsr w2, w2, #4 ++ cbz w5, .enc_xts_mask ++ /* leave the last block for tail */ ++ sub w2, w2, #1 ++ ++.enc_xts_mask: ++ /* init mask */ ++ movi v31.2s, #0x1 ++ movi v16.2s, #0x87 ++ uzp1 v31.4s, v31.4s, v16.4s ++ ++ cbz w2, .enc_xts_tail ++ ++.enc_xts_8block: ++ sub w2, w2, #8 ++ tbnz w2, #31, .enc_xts_4block ++ ++ tweak_calc(v9, v8, v31, v16) ++ tweak_calc(v10, v9, v31, v17) ++ tweak_calc(v11, v10, v31, v18) ++ tweak_calc(v12, v11, v31, v19) ++ tweak_calc(v13, v12, v31, v16) ++ tweak_calc(v14, v13, v31, v17) ++ tweak_calc(v15, v14, v31, v18) ++ ++ ld1 {v20.16b-v23.16b}, [x0], #64 ++ ld1 {v24.16b-v27.16b}, [x0], #64 ++ eor v20.16b, v20.16b, v8.16b ++ eor v21.16b, v21.16b, v9.16b ++ eor v22.16b, v22.16b, v10.16b ++ eor v23.16b, v23.16b, v11.16b ++ eor v24.16b, v24.16b, v12.16b ++ eor v25.16b, v25.16b, v13.16b ++ eor v26.16b, v26.16b, v14.16b ++ eor v27.16b, v27.16b, v15.16b ++ ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ rev32 v24.16b, v24.16b ++ rev32 v25.16b, v25.16b ++ rev32 v26.16b, v26.16b ++ rev32 v27.16b, v27.16b ++ sm4e v20.4s, v0.4s ++ sm4e v21.4s, v0.4s ++ sm4e v22.4s, v0.4s ++ sm4e v23.4s, v0.4s ++ sm4e v24.4s, v0.4s ++ sm4e v25.4s, v0.4s ++ sm4e v26.4s, v0.4s ++ sm4e v27.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v21.4s, v1.4s ++ sm4e 
v22.4s, v1.4s ++ sm4e v23.4s, v1.4s ++ sm4e v24.4s, v1.4s ++ sm4e v25.4s, v1.4s ++ sm4e v26.4s, v1.4s ++ sm4e v27.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v21.4s, v2.4s ++ sm4e v22.4s, v2.4s ++ sm4e v23.4s, v2.4s ++ sm4e v24.4s, v2.4s ++ sm4e v25.4s, v2.4s ++ sm4e v26.4s, v2.4s ++ sm4e v27.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v21.4s, v3.4s ++ sm4e v22.4s, v3.4s ++ sm4e v23.4s, v3.4s ++ sm4e v24.4s, v3.4s ++ sm4e v25.4s, v3.4s ++ sm4e v26.4s, v3.4s ++ sm4e v27.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v21.4s, v4.4s ++ sm4e v22.4s, v4.4s ++ sm4e v23.4s, v4.4s ++ sm4e v24.4s, v4.4s ++ sm4e v25.4s, v4.4s ++ sm4e v26.4s, v4.4s ++ sm4e v27.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v21.4s, v5.4s ++ sm4e v22.4s, v5.4s ++ sm4e v23.4s, v5.4s ++ sm4e v24.4s, v5.4s ++ sm4e v25.4s, v5.4s ++ sm4e v26.4s, v5.4s ++ sm4e v27.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v21.4s, v6.4s ++ sm4e v22.4s, v6.4s ++ sm4e v23.4s, v6.4s ++ sm4e v24.4s, v6.4s ++ sm4e v25.4s, v6.4s ++ sm4e v26.4s, v6.4s ++ sm4e v27.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ sm4e v21.4s, v7.4s ++ sm4e v22.4s, v7.4s ++ sm4e v23.4s, v7.4s ++ sm4e v24.4s, v7.4s ++ sm4e v25.4s, v7.4s ++ sm4e v26.4s, v7.4s ++ sm4e v27.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ rev64 v21.4s, v21.4s ++ rev64 v22.4s, v22.4s ++ rev64 v23.4s, v23.4s ++ rev64 v24.4s, v24.4s ++ rev64 v25.4s, v25.4s ++ rev64 v26.4s, v26.4s ++ rev64 v27.4s, v27.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ ext v21.16b, v21.16b, v21.16b, #8 ++ ext v22.16b, v22.16b, v22.16b, #8 ++ ext v23.16b, v23.16b, v23.16b, #8 ++ ext v24.16b, v24.16b, v24.16b, #8 ++ ext v25.16b, v25.16b, v25.16b, #8 ++ ext v26.16b, v26.16b, v26.16b, #8 ++ ext v27.16b, v27.16b, v27.16b, #8 ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ rev32 v24.16b, v24.16b ++ rev32 v25.16b, v25.16b ++ rev32 v26.16b, v26.16b ++ rev32 v27.16b, v27.16b ++ ++ eor v20.16b, v20.16b, v8.16b ++ eor v21.16b, v21.16b, v9.16b ++ eor v22.16b, v22.16b, v10.16b ++ eor v23.16b, 
v23.16b, v11.16b ++ eor v24.16b, v24.16b, v12.16b ++ eor v25.16b, v25.16b, v13.16b ++ eor v26.16b, v26.16b, v14.16b ++ eor v27.16b, v27.16b, v15.16b ++ st1 {v20.16b-v23.16b}, [x1], #64 ++ st1 {v24.16b-v27.16b}, [x1], #64 ++ ++ tweak_calc(v8, v15, v31, v19) ++ ++ cbz w2, .enc_xts_tail ++ b .enc_xts_8block ++ ++.enc_xts_4block: ++ add w2, w2, #8 ++ cmp w2, #4 ++ blt .enc_xts_1block ++ ++ sub w2, w2, #4 ++ ++ tweak_calc(v9, v8, v31, v16) ++ tweak_calc(v10, v9, v31, v17) ++ tweak_calc(v11, v10, v31, v18) ++ ++ ld1 {v20.16b-v23.16b}, [x0], #64 ++ eor v20.16b, v20.16b, v8.16b ++ eor v21.16b, v21.16b, v9.16b ++ eor v22.16b, v22.16b, v10.16b ++ eor v23.16b, v23.16b, v11.16b ++ ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ sm4e v20.4s, v0.4s ++ sm4e v21.4s, v0.4s ++ sm4e v22.4s, v0.4s ++ sm4e v23.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v21.4s, v1.4s ++ sm4e v22.4s, v1.4s ++ sm4e v23.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v21.4s, v2.4s ++ sm4e v22.4s, v2.4s ++ sm4e v23.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v21.4s, v3.4s ++ sm4e v22.4s, v3.4s ++ sm4e v23.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v21.4s, v4.4s ++ sm4e v22.4s, v4.4s ++ sm4e v23.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v21.4s, v5.4s ++ sm4e v22.4s, v5.4s ++ sm4e v23.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v21.4s, v6.4s ++ sm4e v22.4s, v6.4s ++ sm4e v23.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ sm4e v21.4s, v7.4s ++ sm4e v22.4s, v7.4s ++ sm4e v23.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ rev64 v21.4s, v21.4s ++ rev64 v22.4s, v22.4s ++ rev64 v23.4s, v23.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ ext v21.16b, v21.16b, v21.16b, #8 ++ ext v22.16b, v22.16b, v22.16b, #8 ++ ext v23.16b, v23.16b, v23.16b, #8 ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ ++ eor v20.16b, v20.16b, v8.16b ++ eor v21.16b, v21.16b, v9.16b ++ eor v22.16b, v22.16b, v10.16b ++ eor v23.16b, v23.16b, v11.16b ++ st1 {v20.16b-v23.16b}, [x1], 
#64 ++ ++ tweak_calc(v8, v11, v31, v19) ++ ++ cbz w2, .enc_xts_tail ++ ++.enc_xts_1block: ++ sub w2, w2, #1 ++ ++ ld1 {v20.16b}, [x0], #16 ++ eor v20.16b, v20.16b, v8.16b ++ ++ rev32 v20.16b, v20.16b ++ sm4e v20.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ rev32 v20.16b, v20.16b ++ ++ eor v20.16b, v20.16b, v8.16b ++ st1 {v20.16b}, [x1], #16 ++ ++ tweak_calc(v8, v8, v31, v16) ++ ++ cbnz w2, .enc_xts_1block ++ ++.enc_xts_tail: ++ uxtw x5, w5 ++ cbz x5, .enc_xts_end ++ ++ tweak_calc(v9, v8, v31, v16) ++ ld1 {v20.16b}, [x0] ++ eor v20.16b, v20.16b, v8.16b ++ rev32 v20.16b, v20.16b ++ sm4e v20.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ rev32 v20.16b, v20.16b ++ eor v20.16b, v20.16b, v8.16b ++ ++ adr x6, .cts_permute_table ++ add x7, x6, #32 ++ add x6, x6, x5 ++ sub x7, x7, x5 ++ ld1 {v23.16b}, [x6] ++ ld1 {v24.16b}, [x7] ++ ++ add x0, x0, x5 ++ ld1 {v21.16b}, [x0] ++ ++ tbl v22.16b, {v20.16b}, v23.16b ++ tbx v20.16b, {v21.16b}, v24.16b ++ ++ eor v20.16b, v20.16b, v9.16b ++ rev32 v20.16b, v20.16b ++ sm4e v20.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ rev32 v20.16b, v20.16b ++ eor v20.16b, v20.16b, v9.16b ++ ++ add x5, x1, x5 ++ st1 {v22.16b}, [x5] ++ st1 {v20.16b}, [x1] ++ ++ b .enc_xts_ret ++ ++.enc_xts_end: ++ /* new tweak */ ++ st1 {v8.16b}, [x4] ++ ++.enc_xts_ret: ++ ldp d8,d9,[sp],#16 ++ ret ++.size sm4_v8_xts_encrypt,.-sm4_v8_xts_encrypt ++ ++.globl sm4_v8_xts_decrypt ++.type sm4_v8_xts_decrypt,%function ++.align 5 
++sm4_v8_xts_decrypt: ++ /* parameters: ++ * x0: src ++ * x1: dst ++ * w2: nbytes ++ * x3: key ++ * x4: tweak ++ * x5: key array for tweak ++ */ ++ AARCH64_VALID_CALL_TARGET ++ stp d8,d9,[sp, #-16]! ++ ++ ld1 {v8.16b}, [x4] ++ ++ cbz x5, .dec_xts_nokey2 ++ ++ /* load round key array for tweak */ ++ ld1 {v0.16b-v3.16b}, [x5], #64 ++ ld1 {v4.16b-v7.16b}, [x5] ++ ++ /* first tweak */ ++ rev32 v8.16b, v8.16b ++ sm4e v8.4s, v0.4s ++ sm4e v8.4s, v1.4s ++ sm4e v8.4s, v2.4s ++ sm4e v8.4s, v3.4s ++ sm4e v8.4s, v4.4s ++ sm4e v8.4s, v5.4s ++ sm4e v8.4s, v6.4s ++ sm4e v8.4s, v7.4s ++ rev64 v8.4s, v8.4s ++ ext v8.16b, v8.16b, v8.16b, #8 ++ rev32 v8.16b, v8.16b ++ ++.dec_xts_nokey2: ++ ld1 {v0.16b-v3.16b}, [x3], #64 ++ ld1 {v4.16b-v7.16b}, [x3] ++ ++ and w5, w2, #15 ++ lsr w2, w2, #4 ++ cbz w5, .dec_xts_mask ++ /* leave the last block for tail */ ++ sub w2, w2, #1 ++ ++.dec_xts_mask: ++ /* init mask */ ++ movi v31.2s, #0x1 ++ movi v16.2s, #0x87 ++ uzp1 v31.4s, v31.4s, v16.4s ++ ++ cbz w2, .dec_xts_tail ++ ++.dec_xts_8block: ++ sub w2, w2, #8 ++ tbnz w2, #31, .dec_xts_4block ++ ++ tweak_calc(v9, v8, v31, v16) ++ tweak_calc(v10, v9, v31, v17) ++ tweak_calc(v11, v10, v31, v18) ++ tweak_calc(v12, v11, v31, v19) ++ tweak_calc(v13, v12, v31, v16) ++ tweak_calc(v14, v13, v31, v17) ++ tweak_calc(v15, v14, v31, v18) ++ ++ ld1 {v20.16b-v23.16b}, [x0], #64 ++ ld1 {v24.16b-v27.16b}, [x0], #64 ++ eor v20.16b, v20.16b, v8.16b ++ eor v21.16b, v21.16b, v9.16b ++ eor v22.16b, v22.16b, v10.16b ++ eor v23.16b, v23.16b, v11.16b ++ eor v24.16b, v24.16b, v12.16b ++ eor v25.16b, v25.16b, v13.16b ++ eor v26.16b, v26.16b, v14.16b ++ eor v27.16b, v27.16b, v15.16b ++ ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ rev32 v24.16b, v24.16b ++ rev32 v25.16b, v25.16b ++ rev32 v26.16b, v26.16b ++ rev32 v27.16b, v27.16b ++ sm4e v20.4s, v0.4s ++ sm4e v21.4s, v0.4s ++ sm4e v22.4s, v0.4s ++ sm4e v23.4s, v0.4s ++ sm4e v24.4s, v0.4s ++ sm4e v25.4s, v0.4s ++ 
sm4e v26.4s, v0.4s ++ sm4e v27.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v21.4s, v1.4s ++ sm4e v22.4s, v1.4s ++ sm4e v23.4s, v1.4s ++ sm4e v24.4s, v1.4s ++ sm4e v25.4s, v1.4s ++ sm4e v26.4s, v1.4s ++ sm4e v27.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v21.4s, v2.4s ++ sm4e v22.4s, v2.4s ++ sm4e v23.4s, v2.4s ++ sm4e v24.4s, v2.4s ++ sm4e v25.4s, v2.4s ++ sm4e v26.4s, v2.4s ++ sm4e v27.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v21.4s, v3.4s ++ sm4e v22.4s, v3.4s ++ sm4e v23.4s, v3.4s ++ sm4e v24.4s, v3.4s ++ sm4e v25.4s, v3.4s ++ sm4e v26.4s, v3.4s ++ sm4e v27.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v21.4s, v4.4s ++ sm4e v22.4s, v4.4s ++ sm4e v23.4s, v4.4s ++ sm4e v24.4s, v4.4s ++ sm4e v25.4s, v4.4s ++ sm4e v26.4s, v4.4s ++ sm4e v27.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v21.4s, v5.4s ++ sm4e v22.4s, v5.4s ++ sm4e v23.4s, v5.4s ++ sm4e v24.4s, v5.4s ++ sm4e v25.4s, v5.4s ++ sm4e v26.4s, v5.4s ++ sm4e v27.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v21.4s, v6.4s ++ sm4e v22.4s, v6.4s ++ sm4e v23.4s, v6.4s ++ sm4e v24.4s, v6.4s ++ sm4e v25.4s, v6.4s ++ sm4e v26.4s, v6.4s ++ sm4e v27.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ sm4e v21.4s, v7.4s ++ sm4e v22.4s, v7.4s ++ sm4e v23.4s, v7.4s ++ sm4e v24.4s, v7.4s ++ sm4e v25.4s, v7.4s ++ sm4e v26.4s, v7.4s ++ sm4e v27.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ rev64 v21.4s, v21.4s ++ rev64 v22.4s, v22.4s ++ rev64 v23.4s, v23.4s ++ rev64 v24.4s, v24.4s ++ rev64 v25.4s, v25.4s ++ rev64 v26.4s, v26.4s ++ rev64 v27.4s, v27.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ ext v21.16b, v21.16b, v21.16b, #8 ++ ext v22.16b, v22.16b, v22.16b, #8 ++ ext v23.16b, v23.16b, v23.16b, #8 ++ ext v24.16b, v24.16b, v24.16b, #8 ++ ext v25.16b, v25.16b, v25.16b, #8 ++ ext v26.16b, v26.16b, v26.16b, #8 ++ ext v27.16b, v27.16b, v27.16b, #8 ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ rev32 v24.16b, v24.16b ++ rev32 v25.16b, v25.16b ++ rev32 v26.16b, v26.16b ++ rev32 v27.16b, v27.16b ++ ++ eor v20.16b, v20.16b, 
v8.16b ++ eor v21.16b, v21.16b, v9.16b ++ eor v22.16b, v22.16b, v10.16b ++ eor v23.16b, v23.16b, v11.16b ++ eor v24.16b, v24.16b, v12.16b ++ eor v25.16b, v25.16b, v13.16b ++ eor v26.16b, v26.16b, v14.16b ++ eor v27.16b, v27.16b, v15.16b ++ st1 {v20.16b-v23.16b}, [x1], #64 ++ st1 {v24.16b-v27.16b}, [x1], #64 ++ ++ tweak_calc(v8, v15, v31, v19) ++ ++ cbz w2, .dec_xts_tail ++ b .dec_xts_8block ++ ++.dec_xts_4block: ++ add w2, w2, #8 ++ cmp w2, #4 ++ blt .dec_xts_1block ++ ++ sub w2, w2, #4 ++ ++ tweak_calc(v9, v8, v31, v16) ++ tweak_calc(v10, v9, v31, v17) ++ tweak_calc(v11, v10, v31, v18) ++ ++ ld1 {v20.16b-v23.16b}, [x0], #64 ++ eor v20.16b, v20.16b, v8.16b ++ eor v21.16b, v21.16b, v9.16b ++ eor v22.16b, v22.16b, v10.16b ++ eor v23.16b, v23.16b, v11.16b ++ ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ sm4e v20.4s, v0.4s ++ sm4e v21.4s, v0.4s ++ sm4e v22.4s, v0.4s ++ sm4e v23.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v21.4s, v1.4s ++ sm4e v22.4s, v1.4s ++ sm4e v23.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v21.4s, v2.4s ++ sm4e v22.4s, v2.4s ++ sm4e v23.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v21.4s, v3.4s ++ sm4e v22.4s, v3.4s ++ sm4e v23.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v21.4s, v4.4s ++ sm4e v22.4s, v4.4s ++ sm4e v23.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v21.4s, v5.4s ++ sm4e v22.4s, v5.4s ++ sm4e v23.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v21.4s, v6.4s ++ sm4e v22.4s, v6.4s ++ sm4e v23.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ sm4e v21.4s, v7.4s ++ sm4e v22.4s, v7.4s ++ sm4e v23.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ rev64 v21.4s, v21.4s ++ rev64 v22.4s, v22.4s ++ rev64 v23.4s, v23.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ ext v21.16b, v21.16b, v21.16b, #8 ++ ext v22.16b, v22.16b, v22.16b, #8 ++ ext v23.16b, v23.16b, v23.16b, #8 ++ rev32 v20.16b, v20.16b ++ rev32 v21.16b, v21.16b ++ rev32 v22.16b, v22.16b ++ rev32 v23.16b, v23.16b ++ ++ eor v20.16b, v20.16b, v8.16b ++ eor v21.16b, v21.16b, v9.16b ++ eor 
v22.16b, v22.16b, v10.16b ++ eor v23.16b, v23.16b, v11.16b ++ st1 {v20.16b-v23.16b}, [x1], #64 ++ ++ tweak_calc(v8, v11, v31, v19) ++ ++ cbz w2, .dec_xts_tail ++ ++.dec_xts_1block: ++ sub w2, w2, #1 ++ ++ ld1 {v20.16b}, [x0], #16 ++ eor v20.16b, v20.16b, v8.16b ++ ++ rev32 v20.16b, v20.16b ++ sm4e v20.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ rev32 v20.16b, v20.16b ++ ++ eor v20.16b, v20.16b, v8.16b ++ st1 {v20.16b}, [x1], #16 ++ ++ tweak_calc(v8, v8, v31, v16) ++ ++ cbnz w2, .dec_xts_1block ++ ++.dec_xts_tail: ++ uxtw x5, w5 ++ cbz x5, .dec_xts_end ++ ++ tweak_calc(v9, v8, v31, v16) ++ ld1 {v20.16b}, [x0] ++ eor v20.16b, v20.16b, v9.16b ++ rev32 v20.16b, v20.16b ++ sm4e v20.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ rev32 v20.16b, v20.16b ++ eor v20.16b, v20.16b, v9.16b ++ ++ adr x6, .cts_permute_table ++ add x7, x6, #32 ++ add x6, x6, x5 ++ sub x7, x7, x5 ++ ld1 {v23.16b}, [x6] ++ ld1 {v24.16b}, [x7] ++ ++ add x0, x0, x5 ++ ld1 {v21.16b}, [x0] ++ ++ tbl v22.16b, {v20.16b}, v23.16b ++ tbx v20.16b, {v21.16b}, v24.16b ++ ++ eor v20.16b, v20.16b, v8.16b ++ rev32 v20.16b, v20.16b ++ sm4e v20.4s, v0.4s ++ sm4e v20.4s, v1.4s ++ sm4e v20.4s, v2.4s ++ sm4e v20.4s, v3.4s ++ sm4e v20.4s, v4.4s ++ sm4e v20.4s, v5.4s ++ sm4e v20.4s, v6.4s ++ sm4e v20.4s, v7.4s ++ rev64 v20.4s, v20.4s ++ ext v20.16b, v20.16b, v20.16b, #8 ++ rev32 v20.16b, v20.16b ++ eor v20.16b, v20.16b, v8.16b ++ ++ add x5, x1, x5 ++ st1 {v22.16b}, [x5] ++ st1 {v20.16b}, [x1] ++ ++ b .dec_xts_ret ++ ++.dec_xts_end: ++ /* new tweak */ ++ st1 {v8.16b}, [x4] ++ ++.dec_xts_ret: ++ ldp d8,d9,[sp],#16 ++ ret ++.size 
sm4_v8_xts_decrypt,.-sm4_v8_xts_decrypt +-- +2.25.1 + diff --git a/0027-cipher-add-support-for-SM4-ECB-algorithm-in-CE-instr.patch b/0027-cipher-add-support-for-SM4-ECB-algorithm-in-CE-instr.patch new file mode 100644 index 0000000..2deb528 --- /dev/null +++ b/0027-cipher-add-support-for-SM4-ECB-algorithm-in-CE-instr.patch @@ -0,0 +1,348 @@ +From 6e66b445df0d39b9e796d1a4afcbe617197278de Mon Sep 17 00:00:00 2001 +From: Qi Tao +Date: Wed, 20 Mar 2024 16:13:45 +0800 +Subject: [PATCH 27/44] cipher: add support for SM4(ECB) algorithm in CE + instruction + +Provides the CE acceleration instruction (Crypto-Extension) +to accelerate the execution of the SM4(ECB) algorithm. + +Signed-off-by: Qi Tao +--- + drv/isa_ce_sm4.c | 17 +++ + drv/isa_ce_sm4.h | 2 + + drv/isa_ce_sm4_armv8.S | 263 +++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 282 insertions(+) + +diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c +index 466b060..ccab8fb 100644 +--- a/drv/isa_ce_sm4.c ++++ b/drv/isa_ce_sm4.c +@@ -128,6 +128,16 @@ static void sm4_cbc_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rke + sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, msg->iv, SM4_DECRYPT); + } + ++static void sm4_ecb_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) ++{ ++ sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, SM4_ENCRYPT); ++} ++ ++static void sm4_ecb_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_dec) ++{ ++ sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, SM4_DECRYPT); ++} ++ + void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) + { + sm4_v8_set_encrypt_key(userKey, key); +@@ -254,6 +264,12 @@ static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_ + sm4_set_decrypt_key(msg->key, &rkey); + + switch (msg->mode) { ++ case WD_CIPHER_ECB: ++ if (msg->op_type == WD_CIPHER_ENCRYPTION) ++ sm4_ecb_encrypt(msg, &rkey); ++ else ++ sm4_ecb_decrypt(msg, &rkey); ++ break; + case 
WD_CIPHER_CBC: + if (msg->op_type == WD_CIPHER_ENCRYPTION) + sm4_cbc_encrypt(msg, &rkey); +@@ -317,6 +333,7 @@ static struct wd_alg_driver cipher_alg_driver[] = { + GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), + GEN_CE_ALG_DRIVER("cfb(sm4)", cipher), + GEN_CE_ALG_DRIVER("xts(sm4)", cipher), ++ GEN_CE_ALG_DRIVER("ecb(sm4)", cipher), + }; + + static void __attribute__((constructor)) isa_ce_probe(void) +diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h +index d92069f..d10b0af 100644 +--- a/drv/isa_ce_sm4.h ++++ b/drv/isa_ce_sm4.h +@@ -28,6 +28,8 @@ void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); + void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const struct SM4_KEY *key, + unsigned char *ivec, const int enc); ++void sm4_v8_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const struct SM4_KEY *key, const int enc); + void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, const unsigned char ivec[16]); + +diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S +index 342dfa5..7d84496 100644 +--- a/drv/isa_ce_sm4_armv8.S ++++ b/drv/isa_ce_sm4_armv8.S +@@ -506,6 +506,269 @@ sm4_v8_cbc_encrypt: + ldp d8,d9,[sp],#16 + ret + .size sm4_v8_cbc_encrypt,.-sm4_v8_cbc_encrypt ++.globl sm4_v8_ecb_encrypt ++.type sm4_v8_ecb_encrypt,%function ++.align 5 ++sm4_v8_ecb_encrypt: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x3],#64 ++ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x3] ++1: ++ cmp x2,#64 ++ b.lt 1f ++ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x0],#64 ++ cmp x2,#128 ++ b.lt 2f ++ ld1 {v20.4s,v21.4s,v22.4s,v23.4s},[x0],#64 ++ // 8 blocks ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v20.16b,v20.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 
v21.16b,v21.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v22.16b,v22.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v23.16b,v23.16b ++#endif ++ sm4e v16.4s,v0.4s; ++ sm4e v17.4s,v0.4s; ++ sm4e v18.4s,v0.4s; ++ sm4e v19.4s,v0.4s; ++ ++ sm4e v16.4s,v1.4s; ++ sm4e v17.4s,v1.4s; ++ sm4e v18.4s,v1.4s; ++ sm4e v19.4s,v1.4s; ++ ++ sm4e v16.4s,v2.4s; ++ sm4e v17.4s,v2.4s; ++ sm4e v18.4s,v2.4s; ++ sm4e v19.4s,v2.4s; ++ ++ sm4e v16.4s,v3.4s; ++ sm4e v17.4s,v3.4s; ++ sm4e v18.4s,v3.4s; ++ sm4e v19.4s,v3.4s; ++ ++ sm4e v16.4s,v4.4s; ++ sm4e v17.4s,v4.4s; ++ sm4e v18.4s,v4.4s; ++ sm4e v19.4s,v4.4s; ++ ++ sm4e v16.4s,v5.4s; ++ sm4e v17.4s,v5.4s; ++ sm4e v18.4s,v5.4s; ++ sm4e v19.4s,v5.4s; ++ ++ sm4e v16.4s,v6.4s; ++ sm4e v17.4s,v6.4s; ++ sm4e v18.4s,v6.4s; ++ sm4e v19.4s,v6.4s; ++ ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4S,v16.4S ++ sm4e v17.4s,v7.4s; ++ ext v16.16b,v16.16b,v16.16b,#8 ++ rev64 v17.4S,v17.4S ++ sm4e v18.4s,v7.4s; ++ ext v17.16b,v17.16b,v17.16b,#8 ++ rev64 v18.4S,v18.4S ++ sm4e v19.4s,v7.4s; ++ ext v18.16b,v18.16b,v18.16b,#8 ++ rev64 v19.4S,v19.4S ++ ext v19.16b,v19.16b,v19.16b,#8 ++ sm4e v20.4s,v0.4s; ++ sm4e v21.4s,v0.4s; ++ sm4e v22.4s,v0.4s; ++ sm4e v23.4s,v0.4s; ++ ++ sm4e v20.4s,v1.4s; ++ sm4e v21.4s,v1.4s; ++ sm4e v22.4s,v1.4s; ++ sm4e v23.4s,v1.4s; ++ ++ sm4e v20.4s,v2.4s; ++ sm4e v21.4s,v2.4s; ++ sm4e v22.4s,v2.4s; ++ sm4e v23.4s,v2.4s; ++ ++ sm4e v20.4s,v3.4s; ++ sm4e v21.4s,v3.4s; ++ sm4e v22.4s,v3.4s; ++ sm4e v23.4s,v3.4s; ++ ++ sm4e v20.4s,v4.4s; ++ sm4e v21.4s,v4.4s; ++ sm4e v22.4s,v4.4s; ++ sm4e v23.4s,v4.4s; ++ ++ sm4e v20.4s,v5.4s; ++ sm4e v21.4s,v5.4s; ++ sm4e v22.4s,v5.4s; ++ sm4e v23.4s,v5.4s; ++ ++ sm4e v20.4s,v6.4s; ++ sm4e v21.4s,v6.4s; ++ sm4e v22.4s,v6.4s; ++ sm4e v23.4s,v6.4s; ++ ++ sm4e v20.4s,v7.4s; ++ rev64 v20.4S,v20.4S ++ sm4e v21.4s,v7.4s; ++ ext v20.16b,v20.16b,v20.16b,#8 ++ rev64 v21.4S,v21.4S ++ sm4e v22.4s,v7.4s; ++ ext v21.16b,v21.16b,v21.16b,#8 ++ rev64 v22.4S,v22.4S ++ sm4e v23.4s,v7.4s; ++ ext v22.16b,v22.16b,v22.16b,#8 ++ rev64 
v23.4S,v23.4S ++ ext v23.16b,v23.16b,v23.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v20.16b,v20.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v21.16b,v21.16b ++#endif ++ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 ++#ifndef __ARMEB__ ++ rev32 v22.16b,v22.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v23.16b,v23.16b ++#endif ++ st1 {v20.4s,v21.4s,v22.4s,v23.4s},[x1],#64 ++ subs x2,x2,#128 ++ b.gt 1b ++ ret ++ // 4 blocks ++2: ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v19.16b,v19.16b ++#endif ++ sm4e v16.4s,v0.4s; ++ sm4e v17.4s,v0.4s; ++ sm4e v18.4s,v0.4s; ++ sm4e v19.4s,v0.4s; ++ ++ sm4e v16.4s,v1.4s; ++ sm4e v17.4s,v1.4s; ++ sm4e v18.4s,v1.4s; ++ sm4e v19.4s,v1.4s; ++ ++ sm4e v16.4s,v2.4s; ++ sm4e v17.4s,v2.4s; ++ sm4e v18.4s,v2.4s; ++ sm4e v19.4s,v2.4s; ++ ++ sm4e v16.4s,v3.4s; ++ sm4e v17.4s,v3.4s; ++ sm4e v18.4s,v3.4s; ++ sm4e v19.4s,v3.4s; ++ ++ sm4e v16.4s,v4.4s; ++ sm4e v17.4s,v4.4s; ++ sm4e v18.4s,v4.4s; ++ sm4e v19.4s,v4.4s; ++ ++ sm4e v16.4s,v5.4s; ++ sm4e v17.4s,v5.4s; ++ sm4e v18.4s,v5.4s; ++ sm4e v19.4s,v5.4s; ++ ++ sm4e v16.4s,v6.4s; ++ sm4e v17.4s,v6.4s; ++ sm4e v18.4s,v6.4s; ++ sm4e v19.4s,v6.4s; ++ ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4S,v16.4S ++ sm4e v17.4s,v7.4s; ++ ext v16.16b,v16.16b,v16.16b,#8 ++ rev64 v17.4S,v17.4S ++ sm4e v18.4s,v7.4s; ++ ext v17.16b,v17.16b,v17.16b,#8 ++ rev64 v18.4S,v18.4S ++ sm4e v19.4s,v7.4s; ++ ext v18.16b,v18.16b,v18.16b,#8 ++ rev64 v19.4S,v19.4S ++ ext v19.16b,v19.16b,v19.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v17.16b,v17.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 v18.16b,v18.16b ++#endif ++#ifndef __ARMEB__ ++ rev32 
v19.16b,v19.16b ++#endif ++ st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x1],#64 ++ subs x2,x2,#64 ++ b.gt 1b ++1: ++ subs x2,x2,#16 ++ b.lt 1f ++ ld1 {v16.4s},[x0],#16 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++ sm4e v16.4s,v0.4s; ++ sm4e v16.4s,v1.4s; ++ sm4e v16.4s,v2.4s; ++ sm4e v16.4s,v3.4s; ++ sm4e v16.4s,v4.4s; ++ sm4e v16.4s,v5.4s; ++ sm4e v16.4s,v6.4s; ++ sm4e v16.4s,v7.4s; ++ rev64 v16.4S,v16.4S ++ ext v16.16b,v16.16b,v16.16b,#8 ++#ifndef __ARMEB__ ++ rev32 v16.16b,v16.16b ++#endif ++ st1 {v16.4s},[x1],#16 ++ b.ne 1b ++1: ++ ret ++.size sm4_v8_ecb_encrypt,.-sm4_v8_ecb_encrypt + .globl sm4_v8_ctr32_encrypt_blocks + .type sm4_v8_ctr32_encrypt_blocks,%function + .align 5 +-- +2.25.1 + diff --git a/0028-uadk-cipher-isa_ce-support-SM4-cbc_cts-mode.patch b/0028-uadk-cipher-isa_ce-support-SM4-cbc_cts-mode.patch new file mode 100644 index 0000000..3ef0b90 --- /dev/null +++ b/0028-uadk-cipher-isa_ce-support-SM4-cbc_cts-mode.patch @@ -0,0 +1,337 @@ +From 8c23969dacd7b1ae1b77c1118a8f895bec6fd165 Mon Sep 17 00:00:00 2001 +From: Yang Shen +Date: Wed, 20 Mar 2024 16:15:00 +0800 +Subject: [PATCH 28/44] uadk/cipher: isa_ce - support SM4 cbc_cts mode + +This patch implements the CE instruction using SM4 CBC_CTS modes. + +Signed-off-by: Yang Shen +Signed-off-by: Qi Tao +--- + drv/isa_ce_sm4.c | 91 +++++++++++++++++++++++++++- + drv/isa_ce_sm4.h | 24 +++++--- + drv/isa_ce_sm4_armv8.S | 133 +++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 238 insertions(+), 10 deletions(-) + +diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c +index ccab8fb..6961471 100644 +--- a/drv/isa_ce_sm4.c ++++ b/drv/isa_ce_sm4.c +@@ -128,6 +128,82 @@ static void sm4_cbc_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rke + sm4_v8_cbc_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, msg->iv, SM4_DECRYPT); + } + ++/* ++ * In some situations, the cts mode can use cbc mode instead to imporve performance. 
++ */ ++static int sm4_cts_cbc_instead(struct wd_cipher_msg *msg) ++{ ++ if (msg->in_bytes == SM4_BLOCK_SIZE) ++ return true; ++ ++ if (!(msg->in_bytes % SM4_BLOCK_SIZE) && msg->mode != WD_CIPHER_CBC_CS3) ++ return true; ++ ++ return false; ++} ++ ++static void sm4_cts_cs1_mode_adapt(__u8 *cts_in, __u8 *cts_out, ++ const __u32 cts_bytes, const int enc) ++{ ++ __u32 rsv_bytes = cts_bytes % SM4_BLOCK_SIZE; ++ __u8 blocks[SM4_BLOCK_SIZE] = {0}; ++ ++ if (enc == SM4_ENCRYPT) { ++ memcpy(blocks, cts_out, SM4_BLOCK_SIZE); ++ memcpy(cts_out, cts_out + SM4_BLOCK_SIZE, rsv_bytes); ++ memcpy(cts_out + rsv_bytes, blocks, SM4_BLOCK_SIZE); ++ } else { ++ memcpy(blocks, cts_in + rsv_bytes, SM4_BLOCK_SIZE); ++ memcpy(cts_in + SM4_BLOCK_SIZE, cts_in, rsv_bytes); ++ memcpy(cts_in, blocks, SM4_BLOCK_SIZE); ++ } ++} ++ ++static void sm4_cts_cbc_crypt(struct wd_cipher_msg *msg, ++ const struct SM4_KEY *rkey_enc, const int enc) ++{ ++ enum wd_cipher_mode mode = msg->mode; ++ __u32 in_bytes = msg->in_bytes; ++ __u8 *cts_in, *cts_out; ++ __u32 cts_bytes; ++ ++ if (sm4_cts_cbc_instead(msg)) ++ return sm4_v8_cbc_encrypt(msg->in, msg->out, in_bytes, rkey_enc, msg->iv, enc); ++ ++ cts_bytes = in_bytes % SM4_BLOCK_SIZE + SM4_BLOCK_SIZE; ++ if (cts_bytes == SM4_BLOCK_SIZE) ++ cts_bytes += SM4_BLOCK_SIZE; ++ ++ in_bytes -= cts_bytes; ++ if (in_bytes) ++ sm4_v8_cbc_encrypt(msg->in, msg->out, in_bytes, rkey_enc, msg->iv, enc); ++ ++ cts_in = msg->in + in_bytes; ++ cts_out = msg->out + in_bytes; ++ ++ if (enc == SM4_ENCRYPT) { ++ sm4_v8_cbc_cts_encrypt(cts_in, cts_out, cts_bytes, rkey_enc, msg->iv); ++ ++ if (mode == WD_CIPHER_CBC_CS1) ++ sm4_cts_cs1_mode_adapt(cts_in, cts_out, cts_bytes, enc); ++ } else { ++ if (mode == WD_CIPHER_CBC_CS1) ++ sm4_cts_cs1_mode_adapt(cts_in, cts_out, cts_bytes, enc); ++ ++ sm4_v8_cbc_cts_decrypt(cts_in, cts_out, cts_bytes, rkey_enc, msg->iv); ++ } ++} ++ ++static void sm4_cbc_cts_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) ++{ ++ 
sm4_cts_cbc_crypt(msg, rkey_enc, SM4_ENCRYPT); ++} ++ ++static void sm4_cbc_cts_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) ++{ ++ sm4_cts_cbc_crypt(msg, rkey_enc, SM4_DECRYPT); ++} ++ + static void sm4_ecb_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc) + { + sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_enc, SM4_ENCRYPT); +@@ -138,12 +214,12 @@ static void sm4_ecb_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rke + sm4_v8_ecb_encrypt(msg->in, msg->out, msg->in_bytes, rkey_dec, SM4_DECRYPT); + } + +-void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) ++static void sm4_set_encrypt_key(const __u8 *userKey, struct SM4_KEY *key) + { + sm4_v8_set_encrypt_key(userKey, key); + } + +-void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) ++static void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key) + { + sm4_v8_set_decrypt_key(userKey, key); + } +@@ -276,6 +352,14 @@ static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_ + else + sm4_cbc_decrypt(msg, &rkey); + break; ++ case WD_CIPHER_CBC_CS1: ++ case WD_CIPHER_CBC_CS2: ++ case WD_CIPHER_CBC_CS3: ++ if (msg->op_type == WD_CIPHER_ENCRYPTION) ++ sm4_cbc_cts_encrypt(msg, &rkey); ++ else ++ sm4_cbc_cts_decrypt(msg, &rkey); ++ break; + case WD_CIPHER_CTR: + sm4_ctr_encrypt(msg, &rkey); + break; +@@ -330,6 +414,9 @@ static int cipher_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg) + + static struct wd_alg_driver cipher_alg_driver[] = { + GEN_CE_ALG_DRIVER("cbc(sm4)", cipher), ++ GEN_CE_ALG_DRIVER("cbc-cs1(sm4)", cipher), ++ GEN_CE_ALG_DRIVER("cbc-cs2(sm4)", cipher), ++ GEN_CE_ALG_DRIVER("cbc-cs3(sm4)", cipher), + GEN_CE_ALG_DRIVER("ctr(sm4)", cipher), + GEN_CE_ALG_DRIVER("cfb(sm4)", cipher), + GEN_CE_ALG_DRIVER("xts(sm4)", cipher), +diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h +index d10b0af..308619e 100644 +--- a/drv/isa_ce_sm4.h ++++ b/drv/isa_ce_sm4.h +@@ -25,27 +25,35 @@ struct 
sm4_ce_drv_ctx { + + void sm4_v8_set_encrypt_key(const unsigned char *userKey, struct SM4_KEY *key); + void sm4_v8_set_decrypt_key(const unsigned char *userKey, struct SM4_KEY *key); ++ + void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const struct SM4_KEY *key, + unsigned char *ivec, const int enc); ++void sm4_v8_cbc_cts_encrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, const unsigned char ivec[16]); ++void sm4_v8_cbc_cts_decrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, const unsigned char ivec[16]); ++ + void sm4_v8_ecb_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const struct SM4_KEY *key, const int enc); ++ + void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, +- size_t len, const void *key, const unsigned char ivec[16]); ++ size_t len, const void *key, const unsigned char ivec[16]); + + void sm4_v8_cfb_encrypt_blocks(const unsigned char *in, unsigned char *out, +- size_t length, const struct SM4_KEY *key, unsigned char *ivec); ++ size_t length, const struct SM4_KEY *key, unsigned char *ivec); + void sm4_v8_cfb_decrypt_blocks(const unsigned char *in, unsigned char *out, +- size_t length, const struct SM4_KEY *key, unsigned char *ivec); ++ size_t length, const struct SM4_KEY *key, unsigned char *ivec); ++ + void sm4_v8_crypt_block(const unsigned char *in, unsigned char *out, +- const struct SM4_KEY *key); ++ const struct SM4_KEY *key); + + int sm4_v8_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, +- const struct SM4_KEY *key, unsigned char *ivec, +- const struct SM4_KEY *key2); ++ const struct SM4_KEY *key, unsigned char *ivec, ++ const struct SM4_KEY *key2); + int sm4_v8_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, +- const struct SM4_KEY *key, unsigned char *ivec, +- const struct SM4_KEY *key2); ++ const struct SM4_KEY *key, unsigned char *ivec, ++ 
/*
 * void sm4_v8_cbc_cts_encrypt(const unsigned char *in, unsigned char *out,
 *                             size_t len, const void *key,
 *                             const unsigned char ivec[16]);
 *
 * x0 = in, x1 = out, x2 = len, x3 = key, x4 = ivec
 *
 * Encrypts the first 16 input bytes in CBC fashion, then encrypts the
 * zero-padded last (len - 16) bytes chained to that result, and stores
 * both with overlapping stores.
 *
 * Only v0-v7 and v16-v21 are used as scratch.  The low halves of v8-v15
 * are callee-saved under AAPCS64, so they are deliberately left untouched
 * instead of being spilled to the stack.
 */
.globl	sm4_v8_cbc_cts_encrypt
.type	sm4_v8_cbc_cts_encrypt,%function
.align	5
sm4_v8_cbc_cts_encrypt:
	AARCH64_VALID_CALL_TARGET
	/* v0-v7: round keys */
	ld1	{v0.4s,v1.4s,v2.4s,v3.4s}, [x3], #64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s}, [x3]
	/* x5 = len - 16 */
	sub	x5, x2, #16

	/* v16 = IV */
	ld1	{v16.4s}, [x4]

	/* v17 = first 16 input bytes; v16 = E(IV ^ v17) */
	ld1	{v17.4s}, [x0]
	eor	v16.16b, v16.16b, v17.16b
	rev32	v16.16b, v16.16b
	sm4e	v16.4s, v0.4s
	sm4e	v16.4s, v1.4s
	sm4e	v16.4s, v2.4s
	sm4e	v16.4s, v3.4s
	sm4e	v16.4s, v4.4s
	sm4e	v16.4s, v5.4s
	sm4e	v16.4s, v6.4s
	sm4e	v16.4s, v7.4s
	rev64	v16.4s, v16.4s
	ext	v16.16b, v16.16b, v16.16b, #8
	rev32	v16.16b, v16.16b

	/* load permute table */
	adr	x6, .cts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v20.4s}, [x6]
	ld1	{v21.4s}, [x7]

	/* overlapping loads */
	add	x0, x0, x5
	ld1	{v18.4s}, [x0]

	/* create Cn from En-1 */
	tbl	v17.16b, {v16.16b}, v20.16b
	/* padding Pn with zeros */
	tbl	v18.16b, {v18.16b}, v21.16b

	eor	v18.16b, v18.16b, v16.16b
	rev32	v18.16b, v18.16b
	sm4e	v18.4s, v0.4s
	sm4e	v18.4s, v1.4s
	sm4e	v18.4s, v2.4s
	sm4e	v18.4s, v3.4s
	sm4e	v18.4s, v4.4s
	sm4e	v18.4s, v5.4s
	sm4e	v18.4s, v6.4s
	sm4e	v18.4s, v7.4s
	rev64	v18.4s, v18.4s
	ext	v18.16b, v18.16b, v18.16b, #8
	rev32	v18.16b, v18.16b

	/* overlapping stores: the tail first, then the leading block */
	add	x5, x1, x5
	st1	{v17.16b}, [x5]
	st1	{v18.16b}, [x1]

	ret
.size	sm4_v8_cbc_cts_encrypt,.-sm4_v8_cbc_cts_encrypt

/*
 * void sm4_v8_cbc_cts_decrypt(const unsigned char *in, unsigned char *out,
 *                             size_t len, const void *key,
 *                             const unsigned char ivec[16]);
 *
 * x0 = in, x1 = out, x2 = len, x3 = key, x4 = ivec
 *
 * Inverse of sm4_v8_cbc_cts_encrypt.  Uses only v0-v7 and v16-v21 so that
 * the callee-saved v8-v15 registers are preserved.
 */
.globl	sm4_v8_cbc_cts_decrypt
.type	sm4_v8_cbc_cts_decrypt,%function
.align	5
sm4_v8_cbc_cts_decrypt:
	AARCH64_VALID_CALL_TARGET
	/* v0-v7: round keys */
	ld1	{v0.4s,v1.4s,v2.4s,v3.4s}, [x3], #64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s}, [x3]

	/* x5 = len - 16 */
	sub	x5, x2, #16

	/* v16 = IV */
	ld1	{v16.4s}, [x4]

	/* load permute table */
	adr	x6, .cts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v20.4s}, [x6]
	ld1	{v21.4s}, [x7]

	/* overlapping loads */
	ld1	{v17.16b}, [x0], x5
	ld1	{v18.16b}, [x0]

	rev32	v17.16b, v17.16b
	sm4e	v17.4s, v0.4s
	sm4e	v17.4s, v1.4s
	sm4e	v17.4s, v2.4s
	sm4e	v17.4s, v3.4s
	sm4e	v17.4s, v4.4s
	sm4e	v17.4s, v5.4s
	sm4e	v17.4s, v6.4s
	sm4e	v17.4s, v7.4s
	rev64	v17.4s, v17.4s
	ext	v17.16b, v17.16b, v17.16b, #8
	rev32	v17.16b, v17.16b

	/* select the first Ln bytes of Xn to create Pn */
	tbl	v19.16b, {v17.16b}, v20.16b
	eor	v19.16b, v19.16b, v18.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx	v17.16b, {v18.16b}, v21.16b

	rev32	v17.16b, v17.16b
	sm4e	v17.4s, v0.4s
	sm4e	v17.4s, v1.4s
	sm4e	v17.4s, v2.4s
	sm4e	v17.4s, v3.4s
	sm4e	v17.4s, v4.4s
	sm4e	v17.4s, v5.4s
	sm4e	v17.4s, v6.4s
	sm4e	v17.4s, v7.4s
	rev64	v17.4s, v17.4s
	ext	v17.16b, v17.16b, v17.16b, #8
	rev32	v17.16b, v17.16b

	eor	v17.16b, v17.16b, v16.16b

	/* overlapping stores: the tail first, then the leading block */
	add	x5, x1, x5
	st1	{v19.16b}, [x5]
	st1	{v17.16b}, [x1]

	ret
.size	sm4_v8_cbc_cts_decrypt,.-sm4_v8_cbc_cts_decrypt
platform supports SVE before algorithm driver registration. +If the platform does not support SVE, do not register the algorithm. + +Signed-off-by: Weili Qian +--- + wd_alg.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/wd_alg.c b/wd_alg.c +index f34a407..de352d7 100644 +--- a/wd_alg.c ++++ b/wd_alg.c +@@ -109,6 +109,19 @@ static bool wd_check_ce_support(const char *dev_name) + return false; + } + ++static bool wd_check_sve_support(void) ++{ ++ unsigned long hwcaps = 0; ++ ++ #if defined(__aarch64__) ++ hwcaps = getauxval(AT_HWCAP); ++ #endif ++ if (hwcaps & HWCAP_SVE) ++ return true; ++ ++ return false; ++} ++ + static bool wd_alg_check_available(int calc_type, const char *dev_name) + { + bool ret = false; +@@ -122,6 +135,7 @@ static bool wd_alg_check_available(int calc_type, const char *dev_name) + break; + /* Should find the CPU if not support SVE */ + case UADK_ALG_SVE_INSTR: ++ ret = wd_check_sve_support(); + break; + /* Check if the current driver has device support */ + case UADK_ALG_HW: +-- +2.25.1 + diff --git a/0030-uadk-sched-fix-async-mode-ctx-id.patch b/0030-uadk-sched-fix-async-mode-ctx-id.patch new file mode 100644 index 0000000..021a896 --- /dev/null +++ b/0030-uadk-sched-fix-async-mode-ctx-id.patch @@ -0,0 +1,34 @@ +From 6ab956dc04c04849d2650e08d59b9722522eb201 Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Sat, 23 Mar 2024 17:56:17 +0800 +Subject: [PATCH 30/44] uadk/sched: fix async mode ctx id + +In the single scheduler scenario, ctx id 1 is asynchronous ctx, +but the function sched_single_poll_policy() uses ctx id 0. +As a result, packets fail to be received. Change the value of +ctx id to 1. 
+ +Signed-off-by: Weili Qian +--- + wd_sched.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/wd_sched.c b/wd_sched.c +index b43834d..6766872 100644 +--- a/wd_sched.c ++++ b/wd_sched.c +@@ -428,9 +428,9 @@ static int sched_single_poll_policy(handle_t h_sched_ctx, + } + + while (loop_times > 0) { +- /* Default async mode use ctx 0 */ ++ /* Default async mode use ctx 1 */ + loop_times--; +- ret = sched_ctx->poll_func(0, 1, &poll_num); ++ ret = sched_ctx->poll_func(1, 1, &poll_num); + if ((ret < 0) && (ret != -EAGAIN)) + return ret; + else if (ret == -EAGAIN) +-- +2.25.1 + diff --git a/0031-uadk-initializes-ctx-resources-in-SVE-mode.patch b/0031-uadk-initializes-ctx-resources-in-SVE-mode.patch new file mode 100644 index 0000000..d0fdc9f --- /dev/null +++ b/0031-uadk-initializes-ctx-resources-in-SVE-mode.patch @@ -0,0 +1,169 @@ +From 4516c0f35532713548f4ccd016c562359c713eb4 Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Sat, 23 Mar 2024 17:57:12 +0800 +Subject: [PATCH 31/44] uadk: initializes ctx resources in SVE mode + +Initializes ctx resources in SVE mode. In addition, when the driver +is released, the config resources need to be released in all modes, +not only UADK_ALG_HW. 
+ +Signed-off-by: Weili Qian +--- + include/wd_alg_common.h | 4 ++ + wd_util.c | 95 +++++++++++++++++++++++++++++++++++------ + 2 files changed, 85 insertions(+), 14 deletions(-) + +diff --git a/include/wd_alg_common.h b/include/wd_alg_common.h +index 32b8630..5fee085 100644 +--- a/include/wd_alg_common.h ++++ b/include/wd_alg_common.h +@@ -127,6 +127,10 @@ struct wd_ctx_params { + struct wd_cap_config *cap; + }; + ++struct wd_soft_ctx { ++ void *priv; ++}; ++ + struct wd_ctx_internal { + handle_t ctx; + __u8 op_type; +diff --git a/wd_util.c b/wd_util.c +index fb58167..1e2b190 100644 +--- a/wd_util.c ++++ b/wd_util.c +@@ -28,6 +28,10 @@ + #define US2S(us) ((us) >> 20) + #define WD_INIT_RETRY_TIMEOUT 3 + ++#define WD_SOFT_CTX_NUM 2 ++#define WD_SOFT_SYNC_CTX 0 ++#define WD_SOFT_ASYNC_CTX 1 ++ + #define WD_DRV_LIB_DIR "uadk" + + struct msg_pool { +@@ -1968,8 +1972,7 @@ void wd_alg_uninit_driver(struct wd_ctx_config_internal *config, + + driver->exit(driver); + /* Ctx config just need clear once */ +- if (driver->calc_type == UADK_ALG_HW) +- wd_clear_ctx_config(config); ++ wd_clear_ctx_config(config); + + if (driver->fallback) + wd_alg_uninit_fallback((struct wd_alg_driver *)driver->fallback); +@@ -2660,6 +2663,47 @@ static void wd_alg_ctx_uninit(struct wd_ctx_config *ctx_config) + free(ctx_config->ctxs); + } + ++static int wd_alg_init_sve_ctx(struct wd_ctx_config *ctx_config) ++{ ++ struct wd_soft_ctx *ctx_sync, *ctx_async; ++ ++ ctx_config->ctx_num = WD_SOFT_CTX_NUM; ++ ctx_config->ctxs = calloc(ctx_config->ctx_num, sizeof(struct wd_ctx)); ++ if (!ctx_config->ctxs) ++ return -WD_ENOMEM; ++ ++ ctx_sync = calloc(1, sizeof(struct wd_soft_ctx)); ++ if (!ctx_sync) ++ goto free_ctxs; ++ ++ ctx_config->ctxs[WD_SOFT_SYNC_CTX].op_type = 0; ++ ctx_config->ctxs[WD_SOFT_SYNC_CTX].ctx_mode = CTX_MODE_SYNC; ++ ctx_config->ctxs[WD_SOFT_SYNC_CTX].ctx = (handle_t)ctx_sync; ++ ++ ctx_async = calloc(1, sizeof(struct wd_soft_ctx)); ++ if (!ctx_async) ++ goto free_ctx_sync; ++ ++ 
ctx_config->ctxs[WD_SOFT_ASYNC_CTX].op_type = 0; ++ ctx_config->ctxs[WD_SOFT_ASYNC_CTX].ctx_mode = CTX_MODE_ASYNC; ++ ctx_config->ctxs[WD_SOFT_ASYNC_CTX].ctx = (handle_t)ctx_async; ++ ++ return 0; ++ ++free_ctx_sync: ++ free(ctx_sync); ++free_ctxs: ++ free(ctx_config->ctxs); ++ return -WD_ENOMEM; ++} ++ ++static void wd_alg_uninit_sve_ctx(struct wd_ctx_config *ctx_config) ++{ ++ free((struct wd_soft_ctx *)ctx_config->ctxs[WD_SOFT_ASYNC_CTX].ctx); ++ free((struct wd_soft_ctx *)ctx_config->ctxs[WD_SOFT_SYNC_CTX].ctx); ++ free(ctx_config->ctxs); ++} ++ + int wd_alg_attrs_init(struct wd_init_attrs *attrs) + { + wd_alg_poll_ctx alg_poll_func = attrs->alg_poll_ctx; +@@ -2717,9 +2761,23 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) + } + attrs->sched = alg_sched; + +- ret = wd_sched_rr_instance(alg_sched, NULL); ++ ctx_config = calloc(1, sizeof(*ctx_config)); ++ if (!ctx_config) { ++ WD_ERR("fail to alloc ctx config\n"); ++ goto out_freesched; ++ } ++ attrs->ctx_config = ctx_config; ++ ++ ret = wd_alg_init_sve_ctx(ctx_config); + if (ret) { +- WD_ERR("fail to instance scheduler\n"); ++ WD_ERR("fail to init sve ctx!\n"); ++ goto out_freesched; ++ } ++ ++ ctx_config->cap = attrs->ctx_params->cap; ++ ret = alg_init_func(ctx_config, alg_sched); ++ if (ret) { ++ wd_alg_uninit_sve_ctx(ctx_config); + goto out_freesched; + } + break; +@@ -2780,17 +2838,26 @@ void wd_alg_attrs_uninit(struct wd_init_attrs *attrs) + struct wd_sched *alg_sched = attrs->sched; + int driver_type = attrs->driver->calc_type; + +- if (driver_type == UADK_ALG_CE_INSTR || driver_type == UADK_ALG_SOFT) { +- if (ctx_config) { +- wd_alg_ce_ctx_uninit(ctx_config); +- free(ctx_config); +- } +- } else { +- if (ctx_config) { +- wd_alg_ctx_uninit(ctx_config); +- free(ctx_config); +- } ++ if (!ctx_config) { ++ wd_sched_rr_release(alg_sched); ++ return; ++ } ++ ++ switch (driver_type) { ++ case UADK_ALG_SOFT: ++ case UADK_ALG_CE_INSTR: ++ wd_alg_ce_ctx_uninit(ctx_config); ++ break; ++ case UADK_ALG_SVE_INSTR: 
++ wd_alg_uninit_sve_ctx(ctx_config); ++ break; ++ case UADK_ALG_HW: ++ wd_alg_ctx_uninit(ctx_config); ++ break; ++ default: ++ break; + } + ++ free(ctx_config); + wd_sched_rr_release(alg_sched); + } +-- +2.25.1 + diff --git a/0032-uadk-hash_mb-support-multi-buffer-calculation-for-sm.patch b/0032-uadk-hash_mb-support-multi-buffer-calculation-for-sm.patch new file mode 100644 index 0000000..7089827 --- /dev/null +++ b/0032-uadk-hash_mb-support-multi-buffer-calculation-for-sm.patch @@ -0,0 +1,4092 @@ +From 5dbc53c96ac4efcf26b4dbcdbbf55d1b5e7a06be Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Sat, 23 Mar 2024 18:00:43 +0800 +Subject: [PATCH 32/44] uadk/hash_mb: support multi-buffer calculation for sm3 + and md5 + +Supports sm3 and md5 multi-buffer calculation by using SVE instructions. +If the platform supports SVE instructions, uesrs can choose SVE instructions +to perform sm3 and md5 algorithm calculation. + +The assembly implementation is from isa-l_crypto: +https://github.com/intel/isa-l_crypto.git + +Signed-off-by: Weili Qian +--- + Makefile.am | 15 +- + drv/hash_mb/hash_mb.c | 843 ++++++++++++++++++++++++++++++++++ + drv/hash_mb/hash_mb.h | 62 +++ + drv/hash_mb/md5_mb_asimd_x1.S | 248 ++++++++++ + drv/hash_mb/md5_mb_asimd_x4.S | 526 +++++++++++++++++++++ + drv/hash_mb/md5_mb_sve.S | 158 +++++++ + drv/hash_mb/md5_sve_common.S | 478 +++++++++++++++++++ + drv/hash_mb/sm3_mb_asimd_x1.S | 387 ++++++++++++++++ + drv/hash_mb/sm3_mb_asimd_x4.S | 576 +++++++++++++++++++++++ + drv/hash_mb/sm3_mb_sve.S | 161 +++++++ + drv/hash_mb/sm3_sve_common.S | 505 ++++++++++++++++++++ + 11 files changed, 3958 insertions(+), 1 deletion(-) + create mode 100644 drv/hash_mb/hash_mb.c + create mode 100644 drv/hash_mb/hash_mb.h + create mode 100644 drv/hash_mb/md5_mb_asimd_x1.S + create mode 100644 drv/hash_mb/md5_mb_asimd_x4.S + create mode 100644 drv/hash_mb/md5_mb_sve.S + create mode 100644 drv/hash_mb/md5_sve_common.S + create mode 100644 drv/hash_mb/sm3_mb_asimd_x1.S + create mode 
100644 drv/hash_mb/sm3_mb_asimd_x4.S + create mode 100644 drv/hash_mb/sm3_mb_sve.S + create mode 100644 drv/hash_mb/sm3_sve_common.S + +diff --git a/Makefile.am b/Makefile.am +index f78ad14..68f3106 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -45,7 +45,7 @@ lib_LTLIBRARIES=libwd.la libwd_comp.la libwd_crypto.la + + uadk_driversdir=$(libdir)/uadk + uadk_drivers_LTLIBRARIES=libhisi_sec.la libhisi_hpre.la libhisi_zip.la \ +- libisa_ce.la ++ libisa_ce.la libisa_sve.la + + libwd_la_SOURCES=wd.c wd_mempool.c wd.h wd_alg.c wd_alg.h \ + v1/wd.c v1/wd.h v1/wd_adapter.c v1/wd_adapter.h \ +@@ -94,6 +94,12 @@ libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \ + libisa_ce_la_SOURCES=arm_arch_ce.h drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S isa_ce_sm3.h \ + drv/isa_ce_sm4.c drv/isa_ce_sm4_armv8.S drv/isa_ce_sm4.h + ++libisa_sve_la_SOURCES=drv/hash_mb/hash_mb.c wd_digest_drv.h drv/hash_mb/hash_mb.h \ ++ drv/hash_mb/sm3_sve_common.S drv/hash_mb/sm3_mb_asimd_x1.S \ ++ drv/hash_mb/sm3_mb_asimd_x4.S drv/hash_mb/sm3_mb_sve.S \ ++ drv/hash_mb/md5_sve_common.S drv/hash_mb/md5_mb_asimd_x1.S \ ++ drv/hash_mb/md5_mb_asimd_x4.S drv/hash_mb/md5_mb_sve.S ++ + if WD_STATIC_DRV + AM_CFLAGS += -DWD_STATIC_DRV -fPIC + AM_CFLAGS += -DWD_NO_LOG +@@ -117,6 +123,9 @@ libhisi_hpre_la_DEPENDENCIES = libwd.la libwd_crypto.la + libisa_ce_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS) + libisa_ce_la_DEPENDENCIES = libwd.la libwd_crypto.la + ++libisa_sve_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS) ++libisa_sve_la_DEPENDENCIES = libwd.la libwd_crypto.la ++ + else + UADK_WD_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd.map + UADK_CRYPTO_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd_crypto.map +@@ -149,6 +158,10 @@ libhisi_hpre_la_DEPENDENCIES= libwd.la libwd_crypto.la + libisa_ce_la_LIBADD= -lwd -lwd_crypto + libisa_ce_la_LDFLAGS=$(UADK_VERSION) + libisa_ce_la_DEPENDENCIES= libwd.la libwd_crypto.la ++ ++libisa_sve_la_LIBADD= -lwd -lwd_crypto 
++libisa_sve_la_LDFLAGS=$(UADK_VERSION) ++libisa_sve_la_DEPENDENCIES= libwd.la libwd_crypto.la + endif # WD_STATIC_DRV + + pkgconfigdir = $(libdir)/pkgconfig +diff --git a/drv/hash_mb/hash_mb.c b/drv/hash_mb/hash_mb.c +new file mode 100644 +index 0000000..a73c698 +--- /dev/null ++++ b/drv/hash_mb/hash_mb.c +@@ -0,0 +1,843 @@ ++/* SPDX-License-Identifier: Apache-2.0 */ ++/* Copyright 2024 Huawei Technologies Co.,Ltd. All rights reserved. */ ++ ++#include ++#include ++#include ++#include ++#include "hash_mb.h" ++ ++#define MIN(a, b) (((a) > (b)) ? (b) : (a)) ++#define IPAD_VALUE 0x36 ++#define OPAD_VALUE 0x5C ++#define HASH_KEY_LEN 64 ++#define HASH_BLOCK_OFFSET 6 ++#define HASH_BLOCK_SIZE 64 ++#define HASH_PADLENGTHFIELD_SIZE 56 ++#define HASH_PADDING_SIZE 120 ++#define HASH_HIGH_32BITS 32 ++#define HASH_PADDING_BLOCKS 2 ++#define HASH_NENO_PROCESS_JOBS 4 ++#define HASH_TRY_PROCESS_COUNT 16 ++#define BYTES_TO_BITS_OFFSET 3 ++ ++#define MD5_DIGEST_DATA_SIZE 16 ++#define SM3_DIGEST_DATA_SIZE 32 ++#define HASH_MAX_LANES 32 ++#define SM3_MAX_LANES 16 ++ ++#define PUTU32(p, V) \ ++ ((p)[0] = (uint8_t)((V) >> 24), \ ++ (p)[1] = (uint8_t)((V) >> 16), \ ++ (p)[2] = (uint8_t)((V) >> 8), \ ++ (p)[3] = (uint8_t)(V)) ++ ++struct hash_mb_ops { ++ int (*max_lanes)(void); ++ void (*asimd_x4)(struct hash_job *job1, struct hash_job *job2, ++ struct hash_job *job3, struct hash_job *job4, int len); ++ void (*asimd_x1)(struct hash_job *job, int len); ++ void (*sve)(int blocks, int total_lanes, struct hash_job **job_vec); ++ __u8 *iv_data; ++ int iv_bytes; ++ int max_jobs; ++}; ++ ++struct hash_mb_poll_queue { ++ struct hash_job *head; ++ struct hash_job *tail; ++ pthread_spinlock_t s_lock; ++ const struct hash_mb_ops *ops; ++ __u32 job_num; ++}; ++ ++struct hash_mb_queue { ++ struct hash_mb_poll_queue sm3_poll_queue; ++ struct hash_mb_poll_queue md5_poll_queue; ++ pthread_spinlock_t r_lock; ++ struct hash_job *recv_head; ++ struct hash_job *recv_tail; ++ __u32 complete_cnt; ++ __u8 
ctx_mode; ++}; ++ ++struct hash_mb_ctx { ++ struct wd_ctx_config_internal config; ++}; ++ ++static __u8 sm3_iv_data[SM3_DIGEST_DATA_SIZE] = { ++ 0x73, 0x80, 0x16, 0x6f, 0x49, 0x14, 0xb2, 0xb9, ++ 0x17, 0x24, 0x42, 0xd7, 0xda, 0x8a, 0x06, 0x00, ++ 0xa9, 0x6f, 0x30, 0xbc, 0x16, 0x31, 0x38, 0xaa, ++ 0xe3, 0x8d, 0xee, 0x4d, 0xb0, 0xfb, 0x0e, 0x4e, ++}; ++ ++static __u8 md5_iv_data[MD5_DIGEST_DATA_SIZE] = { ++ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, ++ 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, ++}; ++ ++static struct hash_mb_ops md5_ops = { ++ .max_lanes = md5_mb_sve_max_lanes, ++ .asimd_x4 = md5_mb_asimd_x4, ++ .asimd_x1 = md5_mb_asimd_x1, ++ .sve = md5_mb_sve, ++ .iv_data = md5_iv_data, ++ .iv_bytes = MD5_DIGEST_DATA_SIZE, ++ .max_jobs = HASH_MAX_LANES, ++}; ++ ++static struct hash_mb_ops sm3_ops = { ++ .max_lanes = sm3_mb_sve_max_lanes, ++ .asimd_x4 = sm3_mb_asimd_x4, ++ .asimd_x1 = sm3_mb_asimd_x1, ++ .sve = sm3_mb_sve, ++ .iv_data = sm3_iv_data, ++ .iv_bytes = SM3_DIGEST_DATA_SIZE, ++ .max_jobs = SM3_MAX_LANES, ++}; ++ ++static void hash_mb_uninit_poll_queue(struct hash_mb_poll_queue *poll_queue) ++{ ++ pthread_spin_destroy(&poll_queue->s_lock); ++} ++ ++static void hash_mb_queue_uninit(struct wd_ctx_config_internal *config, int ctx_num) ++{ ++ struct hash_mb_queue *mb_queue; ++ struct wd_soft_ctx *ctx; ++ int i; ++ ++ for (i = 0; i < ctx_num; i++) { ++ ctx = (struct wd_soft_ctx *)config->ctxs[i].ctx; ++ mb_queue = ctx->priv; ++ pthread_spin_destroy(&mb_queue->r_lock); ++ hash_mb_uninit_poll_queue(&mb_queue->sm3_poll_queue); ++ hash_mb_uninit_poll_queue(&mb_queue->md5_poll_queue); ++ free(mb_queue); ++ } ++} ++ ++static int hash_mb_init_poll_queue(struct hash_mb_poll_queue *poll_queue) ++{ ++ int ret; ++ ++ ret = pthread_spin_init(&poll_queue->s_lock, PTHREAD_PROCESS_SHARED); ++ if (ret) { ++ WD_ERR("failed to init s_lock!\n"); ++ return ret; ++ } ++ ++ poll_queue->head = NULL; ++ poll_queue->tail = NULL; ++ poll_queue->job_num = 0; ++ ++ return 
WD_SUCCESS; ++} ++ ++static int hash_mb_queue_init(struct wd_ctx_config_internal *config) ++{ ++ struct hash_mb_queue *mb_queue; ++ int ctx_num = config->ctx_num; ++ struct wd_soft_ctx *ctx; ++ int i, ret; ++ ++ for (i = 0; i < ctx_num; i++) { ++ mb_queue = calloc(1, sizeof(struct hash_mb_queue)); ++ if (!mb_queue) { ++ ret = -WD_ENOMEM; ++ goto free_mb_queue; ++ } ++ ++ mb_queue->ctx_mode = config->ctxs[i].ctx_mode; ++ ctx = (struct wd_soft_ctx *)config->ctxs[i].ctx; ++ ctx->priv = mb_queue; ++ ret = hash_mb_init_poll_queue(&mb_queue->sm3_poll_queue); ++ if (ret) ++ goto free_mem; ++ ++ ret = hash_mb_init_poll_queue(&mb_queue->md5_poll_queue); ++ if (ret) ++ goto uninit_sm3_poll; ++ ++ ret = pthread_spin_init(&mb_queue->r_lock, PTHREAD_PROCESS_SHARED); ++ if (ret) { ++ WD_ERR("failed to init r_lock!\n"); ++ goto uninit_md5_poll; ++ } ++ ++ mb_queue->sm3_poll_queue.ops = &sm3_ops; ++ mb_queue->md5_poll_queue.ops = &md5_ops; ++ mb_queue->recv_head = NULL; ++ mb_queue->recv_tail = NULL; ++ mb_queue->complete_cnt = 0; ++ } ++ ++ return WD_SUCCESS; ++ ++uninit_md5_poll: ++ hash_mb_uninit_poll_queue(&mb_queue->md5_poll_queue); ++uninit_sm3_poll: ++ hash_mb_uninit_poll_queue(&mb_queue->sm3_poll_queue); ++free_mem: ++ free(mb_queue); ++free_mb_queue: ++ hash_mb_queue_uninit(config, i); ++ return ret; ++} ++ ++static int hash_mb_init(struct wd_alg_driver *drv, void *conf) ++{ ++ struct wd_ctx_config_internal *config = conf; ++ struct hash_mb_ctx *priv; ++ int ret; ++ ++ priv = malloc(sizeof(struct hash_mb_ctx)); ++ if (!priv) ++ return -WD_ENOMEM; ++ ++ /* multibuff does not use epoll. 
*/ ++ config->epoll_en = 0; ++ memcpy(&priv->config, config, sizeof(struct wd_ctx_config_internal)); ++ ++ ret = hash_mb_queue_init(config); ++ if (ret) { ++ free(priv); ++ return ret; ++ } ++ ++ drv->priv = priv; ++ ++ return WD_SUCCESS; ++} ++ ++static void hash_mb_exit(struct wd_alg_driver *drv) ++{ ++ struct hash_mb_ctx *priv = (struct hash_mb_ctx *)drv->priv; ++ ++ if (!priv) ++ return; ++ ++ hash_mb_queue_uninit(&priv->config, priv->config.ctx_num); ++ free(priv); ++ drv->priv = NULL; ++} ++ ++static void hash_mb_pad_data(struct hash_pad *hash_pad, __u8 *in, __u32 partial, ++ __u64 total_len, bool transfer) ++{ ++ __u64 size = total_len << BYTES_TO_BITS_OFFSET; ++ __u8 *buffer = hash_pad->pad; ++ ++ if (partial) ++ memcpy(buffer, in, partial); ++ ++ buffer[partial++] = 0x80; ++ if (partial <= HASH_PADLENGTHFIELD_SIZE) { ++ memset(buffer + partial, 0, HASH_PADLENGTHFIELD_SIZE - partial); ++ if (transfer) { ++ PUTU32(buffer + HASH_PADLENGTHFIELD_SIZE, size >> HASH_HIGH_32BITS); ++ PUTU32(buffer + HASH_PADLENGTHFIELD_SIZE + sizeof(__u32), size); ++ } else { ++ memcpy(buffer + HASH_PADLENGTHFIELD_SIZE, &size, sizeof(__u64)); ++ } ++ hash_pad->pad_len = 1; ++ } else { ++ memset(buffer + partial, 0, HASH_PADDING_SIZE - partial); ++ if (transfer) { ++ PUTU32(buffer + HASH_PADDING_SIZE, size >> HASH_HIGH_32BITS); ++ PUTU32(buffer + HASH_PADDING_SIZE + sizeof(__u32), size); ++ } else { ++ memcpy(buffer + HASH_PADDING_SIZE, &size, sizeof(__u64)); ++ } ++ hash_pad->pad_len = HASH_PADDING_BLOCKS; ++ } ++} ++ ++static inline void hash_xor(__u8 *key_out, __u8 *key_in, __u32 key_len, __u8 xor_value) ++{ ++ __u32 i; ++ ++ for (i = 0; i < HASH_KEY_LEN; i++) { ++ if (i < key_len) ++ key_out[i] = key_in[i] ^ xor_value; ++ else ++ key_out[i] = xor_value; ++ } ++} ++ ++static int hash_middle_block_process(struct hash_mb_poll_queue *poll_queue, ++ struct wd_digest_msg *d_msg, ++ struct hash_job *job) ++{ ++ __u8 *buffer = d_msg->partial_block + d_msg->partial_bytes; ++ __u64 
length = (__u64)d_msg->partial_bytes + d_msg->in_bytes; ++ ++ if (length < HASH_BLOCK_SIZE) { ++ memcpy(buffer, d_msg->in, d_msg->in_bytes); ++ d_msg->partial_bytes = length; ++ return -WD_EAGAIN; ++ } ++ ++ if (d_msg->partial_bytes) { ++ memcpy(buffer, d_msg->in, HASH_BLOCK_SIZE - d_msg->partial_bytes); ++ job->buffer = d_msg->partial_block; ++ poll_queue->ops->asimd_x1(job, 1); ++ length = d_msg->in_bytes - (HASH_BLOCK_SIZE - d_msg->partial_bytes); ++ buffer = d_msg->in + (HASH_BLOCK_SIZE - d_msg->partial_bytes); ++ } else { ++ buffer = d_msg->in; ++ } ++ ++ job->len = length >> HASH_BLOCK_OFFSET; ++ d_msg->partial_bytes = length & (HASH_BLOCK_SIZE - 1); ++ if (d_msg->partial_bytes) ++ memcpy(d_msg->partial_block, buffer + (job->len << HASH_BLOCK_OFFSET), ++ d_msg->partial_bytes); ++ ++ if (!job->len) { ++ memcpy(d_msg->out, job->result_digest, poll_queue->ops->iv_bytes); ++ return -WD_EAGAIN; ++ } ++ ++ job->buffer = buffer; ++ job->pad.pad_len = 0; ++ ++ return WD_SUCCESS; ++} ++ ++static void hash_signle_block_process(struct wd_digest_msg *d_msg, ++ struct hash_job *job, __u64 total_len) ++{ ++ __u32 hash_partial = d_msg->in_bytes & (HASH_BLOCK_SIZE - 1); ++ __u8 *buffer; ++ ++ job->len = d_msg->in_bytes >> HASH_BLOCK_OFFSET; ++ buffer = d_msg->in + (job->len << HASH_BLOCK_OFFSET); ++ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer); ++ if (!job->len) { ++ job->buffer = job->pad.pad; ++ job->len = job->pad.pad_len; ++ job->pad.pad_len = 0; ++ return; ++ } ++ ++ job->buffer = d_msg->in; ++} ++ ++static void hash_final_block_process(struct hash_mb_poll_queue *poll_queue, ++ struct wd_digest_msg *d_msg, ++ struct hash_job *job) ++{ ++ __u8 *buffer = d_msg->partial_block + d_msg->partial_bytes; ++ __u64 length = (__u64)d_msg->partial_bytes + d_msg->in_bytes; ++ __u32 hash_partial = length & (HASH_BLOCK_SIZE - 1); ++ __u64 total_len = d_msg->long_data_len; ++ ++ if (job->opad.opad_size) ++ total_len += HASH_BLOCK_SIZE; ++ ++ if 
(!d_msg->partial_bytes) { ++ hash_signle_block_process(d_msg, job, total_len); ++ return; ++ } ++ ++ if (length <= HASH_BLOCK_SIZE) { ++ memcpy(buffer, d_msg->in, d_msg->in_bytes); ++ job->len = length >> HASH_BLOCK_OFFSET; ++ buffer = d_msg->partial_block + (job->len << HASH_BLOCK_OFFSET); ++ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer); ++ if (!job->len) { ++ job->buffer = job->pad.pad; ++ job->len = job->pad.pad_len; ++ job->pad.pad_len = 0; ++ return; ++ } ++ ++ job->buffer = d_msg->partial_block; ++ return; ++ } ++ ++ memcpy(buffer, d_msg->in, (HASH_BLOCK_SIZE - d_msg->partial_bytes)); ++ job->buffer = d_msg->partial_block; ++ poll_queue->ops->asimd_x1(job, 1); ++ job->buffer = d_msg->in + (HASH_BLOCK_SIZE - d_msg->partial_bytes); ++ length = d_msg->in_bytes - (HASH_BLOCK_SIZE - d_msg->partial_bytes); ++ job->len = length >> HASH_BLOCK_OFFSET; ++ buffer = job->buffer + (job->len << HASH_BLOCK_OFFSET); ++ hash_partial = length & (HASH_BLOCK_SIZE - 1); ++ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer); ++ if (!job->len) { ++ job->buffer = job->pad.pad; ++ job->len = job->pad.pad_len; ++ job->pad.pad_len = 0; ++ } ++} ++ ++static int hash_first_block_process(struct wd_digest_msg *d_msg, ++ struct hash_job *job, ++ __u32 iv_bytes) ++{ ++ __u8 *buffer; ++ ++ job->len = d_msg->in_bytes >> HASH_BLOCK_OFFSET; ++ d_msg->partial_bytes = d_msg->in_bytes & (HASH_BLOCK_SIZE - 1); ++ if (d_msg->partial_bytes) { ++ buffer = d_msg->in + (job->len << HASH_BLOCK_OFFSET); ++ memcpy(d_msg->partial_block, buffer, d_msg->partial_bytes); ++ } ++ ++ /* ++ * Long hash mode, if first block is less than HASH_BLOCK_SIZE, ++ * copy ikey hash result to out. 
++ */ ++ if (!job->len) { ++ memcpy(d_msg->out, job->result_digest, iv_bytes); ++ return -WD_EAGAIN; ++ } ++ job->buffer = d_msg->in; ++ job->pad.pad_len = 0; ++ ++ return WD_SUCCESS; ++} ++ ++static int hash_do_partial(struct hash_mb_poll_queue *poll_queue, ++ struct wd_digest_msg *d_msg, struct hash_job *job) ++{ ++ enum hash_block_type bd_type = get_hash_block_type(d_msg); ++ __u64 total_len = d_msg->in_bytes; ++ int ret = WD_SUCCESS; ++ ++ switch (bd_type) { ++ case HASH_FIRST_BLOCK: ++ ret = hash_first_block_process(d_msg, job, poll_queue->ops->iv_bytes); ++ break; ++ case HASH_MIDDLE_BLOCK: ++ ret = hash_middle_block_process(poll_queue, d_msg, job); ++ break; ++ case HASH_END_BLOCK: ++ hash_final_block_process(poll_queue, d_msg, job); ++ break; ++ case HASH_SINGLE_BLOCK: ++ if (job->opad.opad_size) ++ total_len += HASH_BLOCK_SIZE; ++ hash_signle_block_process(d_msg, job, total_len); ++ break; ++ } ++ ++ return ret; ++} ++ ++static void hash_mb_init_iv(struct hash_mb_poll_queue *poll_queue, ++ struct wd_digest_msg *d_msg, struct hash_job *job) ++{ ++ enum hash_block_type bd_type = get_hash_block_type(d_msg); ++ __u8 key_ipad[HASH_KEY_LEN]; ++ __u8 key_opad[HASH_KEY_LEN]; ++ ++ job->opad.opad_size = 0; ++ switch (bd_type) { ++ case HASH_FIRST_BLOCK: ++ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes); ++ if (d_msg->mode != WD_DIGEST_HMAC) ++ return; ++ ++ hash_xor(key_ipad, d_msg->key, d_msg->key_bytes, IPAD_VALUE); ++ job->buffer = key_ipad; ++ poll_queue->ops->asimd_x1(job, 1); ++ break; ++ case HASH_MIDDLE_BLOCK: ++ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes); ++ break; ++ case HASH_END_BLOCK: ++ if (d_msg->mode != WD_DIGEST_HMAC) { ++ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes); ++ return; ++ } ++ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes); ++ hash_xor(key_opad, d_msg->key, d_msg->key_bytes, OPAD_VALUE); ++ job->buffer = key_opad; ++ 
poll_queue->ops->asimd_x1(job, 1); ++ memcpy(job->opad.opad, job->result_digest, poll_queue->ops->iv_bytes); ++ job->opad.opad_size = poll_queue->ops->iv_bytes; ++ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes); ++ break; ++ case HASH_SINGLE_BLOCK: ++ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes); ++ if (d_msg->mode != WD_DIGEST_HMAC) ++ return; ++ ++ hash_xor(key_ipad, d_msg->key, d_msg->key_bytes, IPAD_VALUE); ++ hash_xor(key_opad, d_msg->key, d_msg->key_bytes, OPAD_VALUE); ++ job->buffer = key_opad; ++ poll_queue->ops->asimd_x1(job, 1); ++ memcpy(job->opad.opad, job->result_digest, poll_queue->ops->iv_bytes); ++ job->opad.opad_size = poll_queue->ops->iv_bytes; ++ job->buffer = key_ipad; ++ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes); ++ poll_queue->ops->asimd_x1(job, 1); ++ break; ++ } ++} ++ ++static void hash_do_sync(struct hash_mb_poll_queue *poll_queue, struct hash_job *job) ++{ ++ __u32 iv_bytes = poll_queue->ops->iv_bytes; ++ __u32 length; ++ ++ poll_queue->ops->asimd_x1(job, job->len); ++ ++ if (job->pad.pad_len) { ++ job->buffer = job->pad.pad; ++ poll_queue->ops->asimd_x1(job, job->pad.pad_len); ++ } ++ ++ if (job->opad.opad_size) { ++ job->buffer = job->opad.opad + job->opad.opad_size; ++ memcpy(job->buffer, job->result_digest, iv_bytes); ++ memcpy(job->result_digest, job->opad.opad, iv_bytes); ++ length = HASH_BLOCK_SIZE + iv_bytes; ++ hash_mb_pad_data(&job->pad, job->buffer, iv_bytes, length, job->is_transfer); ++ job->buffer = job->pad.pad; ++ poll_queue->ops->asimd_x1(job, job->pad.pad_len); ++ } ++} ++ ++static void hash_mb_add_job_tail(struct hash_mb_poll_queue *poll_queue, struct hash_job *job) ++{ ++ pthread_spin_lock(&poll_queue->s_lock); ++ if (poll_queue->job_num) { ++ poll_queue->tail->next = job; ++ poll_queue->tail = job; ++ } else { ++ poll_queue->head = job; ++ poll_queue->tail = job; ++ } ++ poll_queue->job_num++; ++ 
pthread_spin_unlock(&poll_queue->s_lock); ++} ++ ++static void hash_mb_add_job_head(struct hash_mb_poll_queue *poll_queue, struct hash_job *job) ++{ ++ pthread_spin_lock(&poll_queue->s_lock); ++ if (poll_queue->job_num) { ++ job->next = poll_queue->head; ++ poll_queue->head = job; ++ } else { ++ poll_queue->head = job; ++ poll_queue->tail = job; ++ } ++ poll_queue->job_num++; ++ pthread_spin_unlock(&poll_queue->s_lock); ++} ++ ++static int hash_mb_check_param(struct hash_mb_queue *mb_queue, struct wd_digest_msg *d_msg) ++{ ++ if (unlikely(mb_queue->ctx_mode == CTX_MODE_ASYNC && d_msg->has_next)) { ++ WD_ERR("invalid: async mode not supports long hash!\n"); ++ return -WD_EINVAL; ++ } ++ ++ if (unlikely(d_msg->data_fmt != WD_FLAT_BUF)) { ++ WD_ERR("invalid: hash multibuffer not supports sgl mode!\n"); ++ return -WD_EINVAL; ++ } ++ ++ return WD_SUCCESS; ++} ++ ++static int hash_mb_send(struct wd_alg_driver *drv, handle_t ctx, void *drv_msg) ++{ ++ struct wd_soft_ctx *s_ctx = (struct wd_soft_ctx *)ctx; ++ struct hash_mb_queue *mb_queue = s_ctx->priv; ++ struct wd_digest_msg *d_msg = drv_msg; ++ struct hash_mb_poll_queue *poll_queue; ++ struct hash_job hash_sync_job; ++ struct hash_job *hash_job; ++ int ret; ++ ++ ret = hash_mb_check_param(mb_queue, d_msg); ++ if (ret) ++ return ret; ++ ++ if (mb_queue->ctx_mode == CTX_MODE_ASYNC) { ++ hash_job = malloc(sizeof(struct hash_job)); ++ if (unlikely(!hash_job)) ++ return -WD_ENOMEM; ++ } else { ++ hash_job = &hash_sync_job; ++ } ++ ++ switch (d_msg->alg) { ++ case WD_DIGEST_SM3: ++ poll_queue = &mb_queue->sm3_poll_queue; ++ hash_job->is_transfer = true; ++ break; ++ case WD_DIGEST_MD5: ++ poll_queue = &mb_queue->md5_poll_queue; ++ hash_job->is_transfer = false; ++ break; ++ default: ++ WD_ERR("invalid: alg type %u not support!\n", d_msg->alg); ++ if (mb_queue->ctx_mode == CTX_MODE_ASYNC) ++ free(hash_job); ++ return -WD_EINVAL; ++ } ++ ++ hash_mb_init_iv(poll_queue, d_msg, hash_job); ++ /* If block not need process, return 
directly. */ ++ ret = hash_do_partial(poll_queue, d_msg, hash_job); ++ if (ret == -WD_EAGAIN) { ++ if (mb_queue->ctx_mode == CTX_MODE_ASYNC) ++ free(hash_job); ++ ++ d_msg->result = WD_SUCCESS; ++ return WD_SUCCESS; ++ } ++ ++ if (mb_queue->ctx_mode == CTX_MODE_SYNC) { ++ hash_do_sync(poll_queue, hash_job); ++ memcpy(d_msg->out, hash_job->result_digest, d_msg->out_bytes); ++ d_msg->result = WD_SUCCESS; ++ return WD_SUCCESS; ++ } ++ ++ hash_job->msg = d_msg; ++ hash_mb_add_job_tail(poll_queue, hash_job); ++ ++ return WD_SUCCESS; ++} ++ ++static struct hash_job *hash_mb_find_complete_job(struct hash_mb_queue *mb_queue) ++{ ++ struct hash_job *job; ++ ++ pthread_spin_lock(&mb_queue->r_lock); ++ if (!mb_queue->complete_cnt) { ++ pthread_spin_unlock(&mb_queue->r_lock); ++ return NULL; ++ } ++ ++ job = mb_queue->recv_head; ++ mb_queue->recv_head = job->next; ++ mb_queue->complete_cnt--; ++ pthread_spin_unlock(&mb_queue->r_lock); ++ ++ return job; ++} ++ ++static int hash_recv_complete_job(struct hash_mb_queue *mb_queue, struct wd_digest_msg *msg) ++{ ++ struct hash_mb_poll_queue *poll_queue; ++ struct hash_job *hash_job; ++ __u32 total_len; ++ ++ hash_job = hash_mb_find_complete_job(mb_queue); ++ if (!hash_job) ++ return -WD_EAGAIN; ++ ++ if (!hash_job->opad.opad_size) { ++ msg->tag = hash_job->msg->tag; ++ memcpy(hash_job->msg->out, hash_job->result_digest, hash_job->msg->out_bytes); ++ free(hash_job); ++ msg->result = WD_SUCCESS; ++ return WD_SUCCESS; ++ } ++ ++ if (hash_job->msg->alg == WD_DIGEST_SM3) ++ poll_queue = &mb_queue->sm3_poll_queue; ++ else ++ poll_queue = &mb_queue->md5_poll_queue; ++ hash_job->buffer = hash_job->opad.opad + poll_queue->ops->iv_bytes; ++ memcpy(hash_job->buffer, hash_job->result_digest, poll_queue->ops->iv_bytes); ++ total_len = poll_queue->ops->iv_bytes + HASH_BLOCK_SIZE; ++ hash_mb_pad_data(&hash_job->pad, hash_job->buffer, poll_queue->ops->iv_bytes, ++ total_len, hash_job->is_transfer); ++ memcpy(hash_job->result_digest, 
hash_job->opad.opad, poll_queue->ops->iv_bytes); ++ hash_job->opad.opad_size = 0; ++ hash_job->buffer = hash_job->pad.pad; ++ hash_job->len = hash_job->pad.pad_len; ++ hash_job->pad.pad_len = 0; ++ ++ hash_mb_add_job_head(poll_queue, hash_job); ++ ++ return -WD_EAGAIN; ++} ++ ++static struct hash_job *hash_mb_get_job(struct hash_mb_poll_queue *poll_queue) ++{ ++ struct hash_job *job; ++ ++ pthread_spin_lock(&poll_queue->s_lock); ++ if (!poll_queue->job_num) { ++ pthread_spin_unlock(&poll_queue->s_lock); ++ return NULL; ++ } ++ ++ job = poll_queue->head; ++ poll_queue->head = job->next; ++ poll_queue->job_num--; ++ pthread_spin_unlock(&poll_queue->s_lock); ++ ++ return job; ++} ++ ++static void hash_mb_add_finish_job(struct hash_mb_queue *mb_queue, struct hash_job *job) ++{ ++ pthread_spin_lock(&mb_queue->r_lock); ++ if (mb_queue->complete_cnt) { ++ mb_queue->recv_tail->next = job; ++ mb_queue->recv_tail = job; ++ } else { ++ mb_queue->recv_head = job; ++ mb_queue->recv_tail = job; ++ } ++ mb_queue->complete_cnt++; ++ pthread_spin_unlock(&mb_queue->r_lock); ++} ++ ++static struct hash_mb_poll_queue *hash_get_poll_queue(struct hash_mb_queue *mb_queue) ++{ ++ if (!mb_queue->sm3_poll_queue.job_num && ++ !mb_queue->md5_poll_queue.job_num) ++ return NULL; ++ ++ if (mb_queue->md5_poll_queue.job_num >= mb_queue->sm3_poll_queue.job_num) ++ return &mb_queue->md5_poll_queue; ++ ++ return &mb_queue->sm3_poll_queue; ++} ++ ++static int hash_mb_do_jobs(struct hash_mb_queue *mb_queue) ++{ ++ struct hash_mb_poll_queue *poll_queue = hash_get_poll_queue(mb_queue); ++ struct hash_job *job_vecs[HASH_MAX_LANES]; ++ __u64 len = 0; ++ int maxjobs; ++ int j = 0; ++ int i = 0; ++ ++ if (!poll_queue) ++ return -WD_EAGAIN; ++ ++ maxjobs = poll_queue->ops->max_lanes(); ++ maxjobs = MIN(maxjobs, poll_queue->ops->max_jobs); ++ while (j < maxjobs) { ++ job_vecs[j] = hash_mb_get_job(poll_queue); ++ if (!job_vecs[j]) ++ break; ++ ++ if (!j) ++ len = job_vecs[j]->len; ++ else ++ len = 
MIN(job_vecs[j]->len, len); ++ j++; ++ } ++ ++ if (!j) ++ return -WD_EAGAIN; ++ ++ if (j > HASH_NENO_PROCESS_JOBS) { ++ poll_queue->ops->sve(len, j, job_vecs); ++ } else if (j == HASH_NENO_PROCESS_JOBS) { ++ poll_queue->ops->asimd_x4(job_vecs[0], job_vecs[1], ++ job_vecs[2], job_vecs[3], len); ++ } else { ++ while (i < j) ++ poll_queue->ops->asimd_x1(job_vecs[i++], len); ++ } ++ ++ for (i = 0; i < j; i++) { ++ if (job_vecs[i]->len == len) { ++ if (!job_vecs[i]->pad.pad_len) { ++ hash_mb_add_finish_job(mb_queue, job_vecs[i]); ++ } else { ++ job_vecs[i]->buffer = job_vecs[i]->pad.pad; ++ job_vecs[i]->len = job_vecs[i]->pad.pad_len; ++ job_vecs[i]->pad.pad_len = 0; ++ hash_mb_add_job_head(poll_queue, job_vecs[i]); ++ } ++ } else { ++ job_vecs[i]->len -= len; ++ job_vecs[i]->buffer += len << HASH_BLOCK_OFFSET; ++ hash_mb_add_job_head(poll_queue, job_vecs[i]); ++ } ++ } ++ ++ return WD_SUCCESS; ++} ++ ++static int hash_mb_recv(struct wd_alg_driver *drv, handle_t ctx, void *drv_msg) ++{ ++ struct wd_soft_ctx *s_ctx = (struct wd_soft_ctx *)ctx; ++ struct hash_mb_queue *mb_queue = s_ctx->priv; ++ struct wd_digest_msg *msg = drv_msg; ++ int ret, i = 0; ++ ++ if (mb_queue->ctx_mode == CTX_MODE_SYNC) ++ return WD_SUCCESS; ++ ++ while (i++ < HASH_TRY_PROCESS_COUNT) { ++ ret = hash_recv_complete_job(mb_queue, msg); ++ if (!ret) ++ return WD_SUCCESS; ++ ++ ret = hash_mb_do_jobs(mb_queue); ++ if (ret) ++ return ret; ++ } ++ ++ return -WD_EAGAIN; ++} ++ ++static int hash_mb_get_usage(void *param) ++{ ++ return 0; ++} ++ ++#define GEN_HASH_ALG_DRIVER(hash_alg_name) \ ++{\ ++ .drv_name = "hash_mb",\ ++ .alg_name = (hash_alg_name),\ ++ .calc_type = UADK_ALG_SVE_INSTR,\ ++ .priority = 100,\ ++ .queue_num = 1,\ ++ .op_type_num = 1,\ ++ .fallback = 0,\ ++ .init = hash_mb_init,\ ++ .exit = hash_mb_exit,\ ++ .send = hash_mb_send,\ ++ .recv = hash_mb_recv,\ ++ .get_usage = hash_mb_get_usage,\ ++} ++ ++static struct wd_alg_driver hash_mb_driver[] = { ++ GEN_HASH_ALG_DRIVER("sm3"), ++ 
GEN_HASH_ALG_DRIVER("md5"), ++}; ++ ++static void __attribute__((constructor)) hash_mb_probe(void) ++{ ++ size_t alg_num = ARRAY_SIZE(hash_mb_driver); ++ size_t i; ++ int ret; ++ ++ WD_INFO("Info: register hash_mb alg drivers!\n"); ++ for (i = 0; i < alg_num; i++) { ++ ret = wd_alg_driver_register(&hash_mb_driver[i]); ++ if (ret && ret != -WD_ENODEV) ++ WD_ERR("Error: register hash multibuff %s failed!\n", ++ hash_mb_driver[i].alg_name); ++ } ++} ++ ++static void __attribute__((destructor)) hash_mb_remove(void) ++{ ++ size_t alg_num = ARRAY_SIZE(hash_mb_driver); ++ size_t i; ++ ++ WD_INFO("Info: unregister hash_mb alg drivers!\n"); ++ for (i = 0; i < alg_num; i++) ++ wd_alg_driver_unregister(&hash_mb_driver[i]); ++} ++ +diff --git a/drv/hash_mb/hash_mb.h b/drv/hash_mb/hash_mb.h +new file mode 100644 +index 0000000..aba5ec9 +--- /dev/null ++++ b/drv/hash_mb/hash_mb.h +@@ -0,0 +1,62 @@ ++/* SPDX-License-Identifier: Apache-2.0 */ ++/* Copyright 2024 Huawei Technologies Co.,Ltd. All rights reserved. 
*/ ++ ++#ifndef __HASH_MB_H ++#define __HASH_MB_H ++ ++#include ++#include ++#include "drv/wd_digest_drv.h" ++#include "wd_digest.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define HASH_BLOCK_SIZE 64 ++#define HASH_DIGEST_NWORDS 32 ++ ++#if __STDC_VERSION__ >= 201112L ++# define __ALIGN_END __attribute__((aligned(64))) ++#else ++# define __ALIGN_END __aligned(64) ++#endif ++ ++struct hash_pad { ++ __u8 pad[HASH_BLOCK_SIZE * 2]; ++ __u32 pad_len; ++}; ++ ++struct hash_opad { ++ __u8 opad[HASH_BLOCK_SIZE]; ++ __u32 opad_size; ++}; ++ ++struct hash_job { ++ void *buffer; ++ __u64 len; ++ __u8 result_digest[HASH_DIGEST_NWORDS] __ALIGN_END; ++ struct hash_pad pad; ++ struct hash_opad opad; ++ struct hash_job *next; ++ struct wd_digest_msg *msg; ++ bool is_transfer; ++}; ++ ++void sm3_mb_sve(int blocks, int total_lanes, struct hash_job **job_vec); ++void sm3_mb_asimd_x4(struct hash_job *job1, struct hash_job *job2, ++ struct hash_job *job3, struct hash_job *job4, int len); ++void sm3_mb_asimd_x1(struct hash_job *job, int len); ++int sm3_mb_sve_max_lanes(void); ++void md5_mb_sve(int blocks, int total_lanes, struct hash_job **job_vec); ++void md5_mb_asimd_x4(struct hash_job *job1, struct hash_job *job2, ++ struct hash_job *job3, struct hash_job *job4, int len); ++void md5_mb_asimd_x1(struct hash_job *job, int len); ++int md5_mb_sve_max_lanes(void); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* __HASH_MB_H */ ++ +diff --git a/drv/hash_mb/md5_mb_asimd_x1.S b/drv/hash_mb/md5_mb_asimd_x1.S +new file mode 100644 +index 0000000..27d1124 +--- /dev/null ++++ b/drv/hash_mb/md5_mb_asimd_x1.S +@@ -0,0 +1,248 @@ ++/********************************************************************** ++ Copyright(c) 2020 Arm Corporation All rights reserved. 
++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ * Neither the name of Arm Corporation nor the names of its ++ contributors may be used to endorse or promote products derived ++ from this software without specific prior written permission. ++ ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++**********************************************************************/ ++ .arch armv8-a ++ ++/* ++Macros ++*/ ++ ++.macro declare_var_vector_reg name:req,reg:req ++ q_\name .req q\reg ++ v_\name .req v\reg ++ s_\name .req s\reg ++.endm ++ ++ ++.macro round_0_15 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req ++ eor tmp0,\d_c,\d_d ++ mov k,\kl ++ and tmp0,tmp0,\d_b ++ movk k,\kh,lsl 16 ++ eor tmp0,tmp0,\d_d ++ add tmp1,k,\w ++ add tmp0,tmp1,tmp0 ++ add tmp0,\d_a,tmp0 ++ ror tmp0,tmp0,32 - \r ++ add \d_a,\d_b,tmp0 ++.endm ++ ++.macro round_16_31 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req ++ eor tmp0,\d_b,\d_c ++ mov k,\kl ++ and tmp0,tmp0,\d_d ++ movk k,\kh,lsl 16 ++ eor tmp0,tmp0,\d_c ++ add tmp1,k,\w ++ add tmp0,tmp1,tmp0 ++ add tmp0,\d_a,tmp0 ++ ror tmp0,tmp0,32 - \r ++ add \d_a,\d_b,tmp0 ++.endm ++ ++.macro round_32_47 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req ++ eor tmp0,\d_b,\d_c ++ mov k,\kl ++ eor tmp0,tmp0,\d_d ++ movk k,\kh,lsl 16 ++ add tmp1,k,\w ++ add tmp0,tmp1,tmp0 ++ add tmp0,\d_a,tmp0 ++ ror tmp0,tmp0,32 - \r ++ add \d_a,\d_b,tmp0 ++.endm ++ ++.macro round_48_63 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req ++ orn tmp0,\d_b,\d_d ++ mov k,\kl ++ eor tmp0,tmp0,\d_c ++ movk k,\kh,lsl 16 ++ add tmp1,k,\w ++ add tmp0,tmp1,tmp0 ++ add tmp0,\d_a,tmp0 ++ ror tmp0,tmp0,32 - \r ++ add \d_a,\d_b,tmp0 ++.endm ++/* ++ variables ++*/ ++ job0 .req x0 ++ digest_addr .req x0 ++ len .req w1 ++ end .req x1 ++ ++ buf_adr .req x2 ++ d_a .req w3 ++ d_b .req w4 ++ d_c .req w5 ++ d_d .req w6 ++ k .req w7 ++ m0 .req w8 ++ m1 .req w9 ++ m2 .req w10 ++ m3 .req w11 ++ m4 .req w12 ++ m5 .req w13 ++ m6 .req w14 ++ m7 .req w15 ++ m8 .req w19 ++ m9 .req w20 ++ m10 .req w21 ++ m11 .req w22 ++ m12 .req w23 ++ m13 .req w24 ++ m14 .req w25 ++ m15 .req w26 ++ ++ tmp0 .req w27 ++ tmp1 .req w28 ++ ++ d_a1 .req w8 ++ d_b1 .req w9 ++ d_c1 .req w15 ++ d_d1 .req w19 ++ ++/* ++ void md5_mb_asimd_x1(MD5_JOB * job0,int len) ++*/ ++ .global 
md5_mb_asimd_x1 ++ .type md5_mb_asimd_x1, %function ++md5_mb_asimd_x1: ++ cmp len,0 ++ stp x29, x30, [sp,-96]! ++ ldr buf_adr,[job0],64 ++ stp x19, x20, [sp, 16] ++ add end,buf_adr,end,lsl 6 ++ stp x21, x22, [sp, 32] ++ ldp d_a,d_b,[digest_addr] ++ stp x23, x24, [sp, 48] ++ ldp d_c,d_d,[digest_addr,8] ++ stp x25, x26, [sp, 64] ++ stp x27, x28, [sp, 80] ++ ble .exit ++ ++.loop_start: ++ ldp m0,m1,[buf_adr],8 ++ ldp m2,m3,[buf_adr],8 ++ round_0_15 d_a,d_b,d_c,d_d,0xd76a,0xa478,m0,7 ++ ++ ldp m4,m5,[buf_adr],8 ++ round_0_15 d_d,d_a,d_b,d_c,0xe8c7,0xb756,m1,12 ++ ldp m6,m7,[buf_adr],8 ++ round_0_15 d_c,d_d,d_a,d_b,0x2420,0x70db,m2,17 ++ ldp m8,m9,[buf_adr],8 ++ round_0_15 d_b,d_c,d_d,d_a,0xc1bd,0xceee,m3,22 ++ ldp m10,m11,[buf_adr],8 ++ round_0_15 d_a,d_b,d_c,d_d,0xf57c,0xfaf,m4,7 ++ ldp m12,m13,[buf_adr],8 ++ round_0_15 d_d,d_a,d_b,d_c,0x4787,0xc62a,m5,12 ++ ldp m14,m15,[buf_adr],8 ++ round_0_15 d_c,d_d,d_a,d_b,0xa830,0x4613,m6,17 ++ round_0_15 d_b,d_c,d_d,d_a,0xfd46,0x9501,m7,22 ++ round_0_15 d_a,d_b,d_c,d_d,0x6980,0x98d8,m8,7 ++ round_0_15 d_d,d_a,d_b,d_c,0x8b44,0xf7af,m9,12 ++ round_0_15 d_c,d_d,d_a,d_b,0xffff,0x5bb1,m10,17 ++ round_0_15 d_b,d_c,d_d,d_a,0x895c,0xd7be,m11,22 ++ round_0_15 d_a,d_b,d_c,d_d,0x6b90,0x1122,m12,7 ++ round_0_15 d_d,d_a,d_b,d_c,0xfd98,0x7193,m13,12 ++ round_0_15 d_c,d_d,d_a,d_b,0xa679,0x438e,m14,17 ++ round_0_15 d_b,d_c,d_d,d_a,0x49b4,0x821,m15,22 ++ ++ round_16_31 d_a,d_b,d_c,d_d,0xf61e,0x2562,m1,5 ++ round_16_31 d_d,d_a,d_b,d_c,0xc040,0xb340,m6,9 ++ round_16_31 d_c,d_d,d_a,d_b,0x265e,0x5a51,m11,14 ++ round_16_31 d_b,d_c,d_d,d_a,0xe9b6,0xc7aa,m0,20 ++ round_16_31 d_a,d_b,d_c,d_d,0xd62f,0x105d,m5,5 ++ round_16_31 d_d,d_a,d_b,d_c,0x244,0x1453,m10,9 ++ round_16_31 d_c,d_d,d_a,d_b,0xd8a1,0xe681,m15,14 ++ round_16_31 d_b,d_c,d_d,d_a,0xe7d3,0xfbc8,m4,20 ++ round_16_31 d_a,d_b,d_c,d_d,0x21e1,0xcde6,m9,5 ++ round_16_31 d_d,d_a,d_b,d_c,0xc337,0x7d6,m14,9 ++ round_16_31 d_c,d_d,d_a,d_b,0xf4d5,0xd87,m3,14 ++ round_16_31 
d_b,d_c,d_d,d_a,0x455a,0x14ed,m8,20 ++ round_16_31 d_a,d_b,d_c,d_d,0xa9e3,0xe905,m13,5 ++ round_16_31 d_d,d_a,d_b,d_c,0xfcef,0xa3f8,m2,9 ++ round_16_31 d_c,d_d,d_a,d_b,0x676f,0x2d9,m7,14 ++ round_16_31 d_b,d_c,d_d,d_a,0x8d2a,0x4c8a,m12,20 ++ ++ round_32_47 d_a,d_b,d_c,d_d,0xfffa,0x3942,m5,4 ++ round_32_47 d_d,d_a,d_b,d_c,0x8771,0xf681,m8,11 ++ round_32_47 d_c,d_d,d_a,d_b,0x6d9d,0x6122,m11,16 ++ round_32_47 d_b,d_c,d_d,d_a,0xfde5,0x380c,m14,23 ++ round_32_47 d_a,d_b,d_c,d_d,0xa4be,0xea44,m1,4 ++ round_32_47 d_d,d_a,d_b,d_c,0x4bde,0xcfa9,m4,11 ++ round_32_47 d_c,d_d,d_a,d_b,0xf6bb,0x4b60,m7,16 ++ round_32_47 d_b,d_c,d_d,d_a,0xbebf,0xbc70,m10,23 ++ round_32_47 d_a,d_b,d_c,d_d,0x289b,0x7ec6,m13,4 ++ round_32_47 d_d,d_a,d_b,d_c,0xeaa1,0x27fa,m0,11 ++ round_32_47 d_c,d_d,d_a,d_b,0xd4ef,0x3085,m3,16 ++ round_32_47 d_b,d_c,d_d,d_a,0x488,0x1d05,m6,23 ++ round_32_47 d_a,d_b,d_c,d_d,0xd9d4,0xd039,m9,4 ++ round_32_47 d_d,d_a,d_b,d_c,0xe6db,0x99e5,m12,11 ++ round_32_47 d_c,d_d,d_a,d_b,0x1fa2,0x7cf8,m15,16 ++ round_32_47 d_b,d_c,d_d,d_a,0xc4ac,0x5665,m2,23 ++ ++ round_48_63 d_a,d_b,d_c,d_d,0xf429,0x2244,m0,6 ++ round_48_63 d_d,d_a,d_b,d_c,0x432a,0xff97,m7,10 ++ round_48_63 d_c,d_d,d_a,d_b,0xab94,0x23a7,m14,15 ++ round_48_63 d_b,d_c,d_d,d_a,0xfc93,0xa039,m5,21 ++ round_48_63 d_a,d_b,d_c,d_d,0x655b,0x59c3,m12,6 ++ round_48_63 d_d,d_a,d_b,d_c,0x8f0c,0xcc92,m3,10 ++ round_48_63 d_c,d_d,d_a,d_b,0xffef,0xf47d,m10,15 ++ round_48_63 d_b,d_c,d_d,d_a,0x8584,0x5dd1,m1,21 ++ round_48_63 d_a,d_b,d_c,d_d,0x6fa8,0x7e4f,m8,6 ++ round_48_63 d_d,d_a,d_b,d_c,0xfe2c,0xe6e0,m15,10 ++ round_48_63 d_c,d_d,d_a,d_b,0xa301,0x4314,m6,15 ++ round_48_63 d_b,d_c,d_d,d_a,0x4e08,0x11a1,m13,21 ++ round_48_63 d_a,d_b,d_c,d_d,0xf753,0x7e82,m4,6 ++ ldp d_a1,d_b1,[digest_addr] ++ round_48_63 d_d,d_a,d_b,d_c,0xbd3a,0xf235,m11,10 ++ ldp d_c1,d_d1,[digest_addr,8] ++ round_48_63 d_c,d_d,d_a,d_b,0x2ad7,0xd2bb,m2,15 ++ round_48_63 d_b,d_c,d_d,d_a,0xeb86,0xd391,m9,21 ++ ++ cmp buf_adr,end ++ add d_a,d_a1 ,d_a ++ str 
d_a,[digest_addr] ++ add d_b,d_b1 ,d_b ++ str d_b,[digest_addr,4] ++ add d_c,d_c1 ,d_c ++ str d_c,[digest_addr,8] ++ add d_d,d_d1 ,d_d ++ str d_d,[digest_addr,12] ++ bne .loop_start ++ ++.exit: ++ ldp x19, x20, [sp, 16] ++ ldp x21, x22, [sp, 32] ++ ldp x23, x24, [sp, 48] ++ ldp x25, x26, [sp, 64] ++ ldp x27, x28, [sp, 80] ++ ldp x29, x30, [sp], 96 ++ ret ++ .size md5_mb_asimd_x1, .-md5_mb_asimd_x1 +diff --git a/drv/hash_mb/md5_mb_asimd_x4.S b/drv/hash_mb/md5_mb_asimd_x4.S +new file mode 100644 +index 0000000..5397913 +--- /dev/null ++++ b/drv/hash_mb/md5_mb_asimd_x4.S +@@ -0,0 +1,526 @@ ++/********************************************************************** ++ Copyright(c) 2020 Arm Corporation All rights reserved. ++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ * Neither the name of Arm Corporation nor the names of its ++ contributors may be used to endorse or promote products derived ++ from this software without specific prior written permission. ++ ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++**********************************************************************/ ++ .arch armv8-a ++ ++/* ++Macros ++*/ ++ ++.macro declare_var_vector_reg name:req,reg:req ++ q_\name .req q\reg ++ v_\name .req v\reg ++ s_\name .req s\reg ++.endm ++ ++.macro add_key_rol a:req,b:req,k:req,w:req,r:req ++ add v_tmp0.4s,v_\k\().4s,v_\w\().4s ++ add v_tmp1.4s,v_tmp1.4s,v_\a\().4s ++ add v_tmp1.4s,v_tmp1.4s,v_tmp0.4s ++ shl v_tmp0.4s,v_tmp1.4s,\r ++ ushr v_tmp1.4s,v_tmp1.4s,32-\r ++ orr v_tmp0.16b,v_tmp1.16b,v_tmp0.16b ++ ++ add v_\a\().4s,v_\b\().4s,v_tmp0.4s ++.endm ++.macro round_0_15 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req ++ mov v_tmp1.16b, v_\b\().16b ++ bsl v_tmp1.16b, v_\c\().16b, v_\d\().16b ++ ldr q_\k1,[key_adr],16 ++ add_key_rol \a,\b,\k,\w,\r ++.endm ++ ++.macro round_16_31 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req ++ mov v_tmp1.16b, v_\d\().16b ++ bsl v_tmp1.16b, v_\b\().16b, v_\c\().16b ++ ldr q_\k1,[key_adr],16 ++ add_key_rol \a,\b,\k,\w,\r ++.endm ++ ++.macro round_32_47 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req ++ eor v_tmp1.16b,v_\b\().16b,v_\c\().16b ++ eor v_tmp1.16b,v_tmp1.16b,v_\d\().16b ++ ldr q_\k1,[key_adr],16 ++ add_key_rol \a,\b,\k,\w,\r ++.endm ++ ++.macro round_48_63 a:req,b:req,c:req,d:req,k:req,k1,w:req,r:req ++ orn v_tmp1.16b,v_\b\().16b,v_\d\().16b ++ eor v_tmp1.16b,v_tmp1.16b,v_\c\().16b ++ .ifnb \k1 ++ ldr q_\k1,[key_adr],16 ++ .endif ++ add_key_rol \a,\b,\k,\w,\r ++.endm ++/* ++ variables ++*/ 
++ declare_var_vector_reg tmp0, 0 ++ declare_var_vector_reg tmp1, 1 ++ declare_var_vector_reg k, 2 ++ declare_var_vector_reg k1, 3 ++ declare_var_vector_reg a, 4 ++ declare_var_vector_reg b, 5 ++ declare_var_vector_reg c, 6 ++ declare_var_vector_reg d, 7 ++ declare_var_vector_reg a1, 8 ++ declare_var_vector_reg b1, 9 ++ declare_var_vector_reg c1, 10 ++ declare_var_vector_reg d1, 11 ++ ++ declare_var_vector_reg w0, 16 ++ declare_var_vector_reg w1, 17 ++ declare_var_vector_reg w2, 18 ++ declare_var_vector_reg w3, 19 ++ declare_var_vector_reg w4, 20 ++ declare_var_vector_reg w5, 21 ++ declare_var_vector_reg w6, 22 ++ declare_var_vector_reg w7, 23 ++ declare_var_vector_reg w8, 24 ++ declare_var_vector_reg w9, 25 ++ declare_var_vector_reg w10, 26 ++ declare_var_vector_reg w11, 27 ++ declare_var_vector_reg w12, 28 ++ declare_var_vector_reg w13, 29 ++ declare_var_vector_reg w14, 30 ++ declare_var_vector_reg w15, 31 ++ ++ len .req w4 ++ len_x .req x4 ++ lane0 .req x5 ++ lane1 .req x6 ++ lane2 .req x7 ++ lane3 .req x9 ++ end .req x4 ++ job0 .req x0 ++ job1 .req x1 ++ job2 .req x2 ++ job3 .req x3 ++ key_adr .req x10 ++ ++/* ++ void md5_mb_asimd_x4(MD5_JOB * job0, MD5_JOB * job1, ++ MD5_JOB * job2, MD5_JOB * job3, int len) ++*/ ++ .global md5_mb_asimd_x4 ++ .type md5_mb_asimd_x4, %function ++md5_mb_asimd_x4: ++ stp x29,x30,[sp,-48]! 
++ ldr lane0,[job0],64 ++ stp d8,d9,[sp,16] ++ ldr lane1,[job1],64 ++ stp d10,d11,[sp,32] ++ ldr lane2,[job2],64 ++ cmp len,0 ++ ldr lane3,[job3],64 ++ ble .exit ++ ++ //load digests ++ ld4 {v_a.s-v_d.s}[0],[job0] ++ add end,lane0,len_x,lsl 6 ++ ld4 {v_a.s-v_d.s}[1],[job1] ++ ld4 {v_a.s-v_d.s}[2],[job2] ++ ld4 {v_a.s-v_d.s}[3],[job3] ++.loop_start: ++ ld1 {v_w0.s}[0],[lane0],4 ++ mov v_a1.16b,v_a.16b ++ ld1 {v_w0.s}[1],[lane1],4 ++ mov v_b1.16b,v_b.16b ++ ld1 {v_w0.s}[2],[lane2],4 ++ mov v_c1.16b,v_c.16b ++ ld1 {v_w0.s}[3],[lane3],4 ++ mov v_d1.16b,v_d.16b ++ ++ ld3 {v_w1.s-v_w3.s}[0],[lane0],12 ++ adrp key_adr,.key_consts ++ ld3 {v_w1.s-v_w3.s}[1],[lane1],12 ++ add key_adr,key_adr,#:lo12:.key_consts ++ ld3 {v_w1.s-v_w3.s}[2],[lane2],12 ++ ldr q_k,[key_adr],16 ++ ld3 {v_w1.s-v_w3.s}[3],[lane3],12 ++ ++ ++ ld4 {v_w4.s-v_w7.s}[0], [lane0],16 ++ ++ round_0_15 a,b,c,d,k,k1,w0,7 ++ ++ ld4 {v_w4.s-v_w7.s}[1], [lane1],16 ++ round_0_15 d,a,b,c,k1,k,w1,12 ++ ld4 {v_w4.s-v_w7.s}[2], [lane2],16 ++ round_0_15 c,d,a,b,k,k1,w2,17 ++ ld4 {v_w4.s-v_w7.s}[3], [lane3],16 ++ round_0_15 b,c,d,a,k1,k,w3,22 ++ ld4 {v_w8.s-v_w11.s}[0],[lane0],16 ++ round_0_15 a,b,c,d,k,k1,w4,7 ++ ld4 {v_w8.s-v_w11.s}[1],[lane1],16 ++ round_0_15 d,a,b,c,k1,k,w5,12 ++ ld4 {v_w8.s-v_w11.s}[2],[lane2],16 ++ round_0_15 c,d,a,b,k,k1,w6,17 ++ ld4 {v_w8.s-v_w11.s}[3],[lane3],16 ++ round_0_15 b,c,d,a,k1,k,w7,22 ++ ld4 {v_w12.s-v_w15.s}[0],[lane0],16 ++ round_0_15 a,b,c,d,k,k1,w8,7 ++ ld4 {v_w12.s-v_w15.s}[1],[lane1],16 ++ round_0_15 d,a,b,c,k1,k,w9,12 ++ ld4 {v_w12.s-v_w15.s}[2],[lane2],16 ++ round_0_15 c,d,a,b,k,k1,w10,17 ++ ld4 {v_w12.s-v_w15.s}[3],[lane3],16 ++ round_0_15 b,c,d,a,k1,k,w11,22 ++ round_0_15 a,b,c,d,k,k1,w12,7 ++ round_0_15 d,a,b,c,k1,k,w13,12 ++ round_0_15 c,d,a,b,k,k1,w14,17 ++ round_0_15 b,c,d,a,k1,k,w15,22 ++ ++ round_16_31 a,b,c,d,k,k1,w1,5 ++ round_16_31 d,a,b,c,k1,k,w6,9 ++ round_16_31 c,d,a,b,k,k1,w11,14 ++ round_16_31 b,c,d,a,k1,k,w0,20 ++ round_16_31 a,b,c,d,k,k1,w5,5 ++ round_16_31 
d,a,b,c,k1,k,w10,9 ++ round_16_31 c,d,a,b,k,k1,w15,14 ++ round_16_31 b,c,d,a,k1,k,w4,20 ++ round_16_31 a,b,c,d,k,k1,w9,5 ++ round_16_31 d,a,b,c,k1,k,w14,9 ++ round_16_31 c,d,a,b,k,k1,w3,14 ++ round_16_31 b,c,d,a,k1,k,w8,20 ++ round_16_31 a,b,c,d,k,k1,w13,5 ++ round_16_31 d,a,b,c,k1,k,w2,9 ++ round_16_31 c,d,a,b,k,k1,w7,14 ++ round_16_31 b,c,d,a,k1,k,w12,20 ++ ++ round_32_47 a,b,c,d,k,k1,w5,4 ++ round_32_47 d,a,b,c,k1,k,w8,11 ++ round_32_47 c,d,a,b,k,k1,w11,16 ++ round_32_47 b,c,d,a,k1,k,w14,23 ++ round_32_47 a,b,c,d,k,k1,w1,4 ++ round_32_47 d,a,b,c,k1,k,w4,11 ++ round_32_47 c,d,a,b,k,k1,w7,16 ++ round_32_47 b,c,d,a,k1,k,w10,23 ++ round_32_47 a,b,c,d,k,k1,w13,4 ++ round_32_47 d,a,b,c,k1,k,w0,11 ++ round_32_47 c,d,a,b,k,k1,w3,16 ++ round_32_47 b,c,d,a,k1,k,w6,23 ++ round_32_47 a,b,c,d,k,k1,w9,4 ++ round_32_47 d,a,b,c,k1,k,w12,11 ++ round_32_47 c,d,a,b,k,k1,w15,16 ++ round_32_47 b,c,d,a,k1,k,w2,23 ++ ++ round_48_63 a,b,c,d,k,k1,w0,6 ++ round_48_63 d,a,b,c,k1,k,w7,10 ++ round_48_63 c,d,a,b,k,k1,w14,15 ++ round_48_63 b,c,d,a,k1,k,w5,21 ++ round_48_63 a,b,c,d,k,k1,w12,6 ++ round_48_63 d,a,b,c,k1,k,w3,10 ++ round_48_63 c,d,a,b,k,k1,w10,15 ++ round_48_63 b,c,d,a,k1,k,w1,21 ++ round_48_63 a,b,c,d,k,k1,w8,6 ++ round_48_63 d,a,b,c,k1,k,w15,10 ++ round_48_63 c,d,a,b,k,k1,w6,15 ++ round_48_63 b,c,d,a,k1,k,w13,21 ++ round_48_63 a,b,c,d,k,k1,w4,6 ++ round_48_63 d,a,b,c,k1,k,w11,10 ++ round_48_63 c,d,a,b,k,k1,w2,15 ++ round_48_63 b,c,d,a,k1, ,w9,21 ++ ++ ++ ++ ++ cmp lane0,end ++ add v_a.4s,v_a1.4s,v_a.4s ++ add v_b.4s,v_b1.4s,v_b.4s ++ add v_c.4s,v_c1.4s,v_c.4s ++ add v_d.4s,v_d1.4s,v_d.4s ++ bne .loop_start ++ ++ st4 {v_a.s-v_d.s}[0],[job0] ++ st4 {v_a.s-v_d.s}[1],[job1] ++ st4 {v_a.s-v_d.s}[2],[job2] ++ st4 {v_a.s-v_d.s}[3],[job3] ++.exit: ++ ldp d8,d9,[sp,16] ++ ldp d10,d11,[sp,32] ++ ldp x29,x30,[sp],48 ++ ret ++.key_consts: ++ .word 0xd76aa478 ++ .word 0xd76aa478 ++ .word 0xd76aa478 ++ .word 0xd76aa478 ++ .word 0xe8c7b756 ++ .word 0xe8c7b756 ++ .word 0xe8c7b756 ++ .word 
0xe8c7b756 ++ .word 0x242070db ++ .word 0x242070db ++ .word 0x242070db ++ .word 0x242070db ++ .word 0xc1bdceee ++ .word 0xc1bdceee ++ .word 0xc1bdceee ++ .word 0xc1bdceee ++ .word 0xf57c0faf ++ .word 0xf57c0faf ++ .word 0xf57c0faf ++ .word 0xf57c0faf ++ .word 0x4787c62a ++ .word 0x4787c62a ++ .word 0x4787c62a ++ .word 0x4787c62a ++ .word 0xa8304613 ++ .word 0xa8304613 ++ .word 0xa8304613 ++ .word 0xa8304613 ++ .word 0xfd469501 ++ .word 0xfd469501 ++ .word 0xfd469501 ++ .word 0xfd469501 ++ .word 0x698098d8 ++ .word 0x698098d8 ++ .word 0x698098d8 ++ .word 0x698098d8 ++ .word 0x8b44f7af ++ .word 0x8b44f7af ++ .word 0x8b44f7af ++ .word 0x8b44f7af ++ .word 0xffff5bb1 ++ .word 0xffff5bb1 ++ .word 0xffff5bb1 ++ .word 0xffff5bb1 ++ .word 0x895cd7be ++ .word 0x895cd7be ++ .word 0x895cd7be ++ .word 0x895cd7be ++ .word 0x6b901122 ++ .word 0x6b901122 ++ .word 0x6b901122 ++ .word 0x6b901122 ++ .word 0xfd987193 ++ .word 0xfd987193 ++ .word 0xfd987193 ++ .word 0xfd987193 ++ .word 0xa679438e ++ .word 0xa679438e ++ .word 0xa679438e ++ .word 0xa679438e ++ .word 0x49b40821 ++ .word 0x49b40821 ++ .word 0x49b40821 ++ .word 0x49b40821 ++ .word 0xf61e2562 ++ .word 0xf61e2562 ++ .word 0xf61e2562 ++ .word 0xf61e2562 ++ .word 0xc040b340 ++ .word 0xc040b340 ++ .word 0xc040b340 ++ .word 0xc040b340 ++ .word 0x265e5a51 ++ .word 0x265e5a51 ++ .word 0x265e5a51 ++ .word 0x265e5a51 ++ .word 0xe9b6c7aa ++ .word 0xe9b6c7aa ++ .word 0xe9b6c7aa ++ .word 0xe9b6c7aa ++ .word 0xd62f105d ++ .word 0xd62f105d ++ .word 0xd62f105d ++ .word 0xd62f105d ++ .word 0x02441453 ++ .word 0x02441453 ++ .word 0x02441453 ++ .word 0x02441453 ++ .word 0xd8a1e681 ++ .word 0xd8a1e681 ++ .word 0xd8a1e681 ++ .word 0xd8a1e681 ++ .word 0xe7d3fbc8 ++ .word 0xe7d3fbc8 ++ .word 0xe7d3fbc8 ++ .word 0xe7d3fbc8 ++ .word 0x21e1cde6 ++ .word 0x21e1cde6 ++ .word 0x21e1cde6 ++ .word 0x21e1cde6 ++ .word 0xc33707d6 ++ .word 0xc33707d6 ++ .word 0xc33707d6 ++ .word 0xc33707d6 ++ .word 0xf4d50d87 ++ .word 0xf4d50d87 ++ .word 0xf4d50d87 ++ .word 
0xf4d50d87 ++ .word 0x455a14ed ++ .word 0x455a14ed ++ .word 0x455a14ed ++ .word 0x455a14ed ++ .word 0xa9e3e905 ++ .word 0xa9e3e905 ++ .word 0xa9e3e905 ++ .word 0xa9e3e905 ++ .word 0xfcefa3f8 ++ .word 0xfcefa3f8 ++ .word 0xfcefa3f8 ++ .word 0xfcefa3f8 ++ .word 0x676f02d9 ++ .word 0x676f02d9 ++ .word 0x676f02d9 ++ .word 0x676f02d9 ++ .word 0x8d2a4c8a ++ .word 0x8d2a4c8a ++ .word 0x8d2a4c8a ++ .word 0x8d2a4c8a ++ .word 0xfffa3942 ++ .word 0xfffa3942 ++ .word 0xfffa3942 ++ .word 0xfffa3942 ++ .word 0x8771f681 ++ .word 0x8771f681 ++ .word 0x8771f681 ++ .word 0x8771f681 ++ .word 0x6d9d6122 ++ .word 0x6d9d6122 ++ .word 0x6d9d6122 ++ .word 0x6d9d6122 ++ .word 0xfde5380c ++ .word 0xfde5380c ++ .word 0xfde5380c ++ .word 0xfde5380c ++ .word 0xa4beea44 ++ .word 0xa4beea44 ++ .word 0xa4beea44 ++ .word 0xa4beea44 ++ .word 0x4bdecfa9 ++ .word 0x4bdecfa9 ++ .word 0x4bdecfa9 ++ .word 0x4bdecfa9 ++ .word 0xf6bb4b60 ++ .word 0xf6bb4b60 ++ .word 0xf6bb4b60 ++ .word 0xf6bb4b60 ++ .word 0xbebfbc70 ++ .word 0xbebfbc70 ++ .word 0xbebfbc70 ++ .word 0xbebfbc70 ++ .word 0x289b7ec6 ++ .word 0x289b7ec6 ++ .word 0x289b7ec6 ++ .word 0x289b7ec6 ++ .word 0xeaa127fa ++ .word 0xeaa127fa ++ .word 0xeaa127fa ++ .word 0xeaa127fa ++ .word 0xd4ef3085 ++ .word 0xd4ef3085 ++ .word 0xd4ef3085 ++ .word 0xd4ef3085 ++ .word 0x04881d05 ++ .word 0x04881d05 ++ .word 0x04881d05 ++ .word 0x04881d05 ++ .word 0xd9d4d039 ++ .word 0xd9d4d039 ++ .word 0xd9d4d039 ++ .word 0xd9d4d039 ++ .word 0xe6db99e5 ++ .word 0xe6db99e5 ++ .word 0xe6db99e5 ++ .word 0xe6db99e5 ++ .word 0x1fa27cf8 ++ .word 0x1fa27cf8 ++ .word 0x1fa27cf8 ++ .word 0x1fa27cf8 ++ .word 0xc4ac5665 ++ .word 0xc4ac5665 ++ .word 0xc4ac5665 ++ .word 0xc4ac5665 ++ .word 0xf4292244 ++ .word 0xf4292244 ++ .word 0xf4292244 ++ .word 0xf4292244 ++ .word 0x432aff97 ++ .word 0x432aff97 ++ .word 0x432aff97 ++ .word 0x432aff97 ++ .word 0xab9423a7 ++ .word 0xab9423a7 ++ .word 0xab9423a7 ++ .word 0xab9423a7 ++ .word 0xfc93a039 ++ .word 0xfc93a039 ++ .word 0xfc93a039 ++ .word 
0xfc93a039 ++ .word 0x655b59c3 ++ .word 0x655b59c3 ++ .word 0x655b59c3 ++ .word 0x655b59c3 ++ .word 0x8f0ccc92 ++ .word 0x8f0ccc92 ++ .word 0x8f0ccc92 ++ .word 0x8f0ccc92 ++ .word 0xffeff47d ++ .word 0xffeff47d ++ .word 0xffeff47d ++ .word 0xffeff47d ++ .word 0x85845dd1 ++ .word 0x85845dd1 ++ .word 0x85845dd1 ++ .word 0x85845dd1 ++ .word 0x6fa87e4f ++ .word 0x6fa87e4f ++ .word 0x6fa87e4f ++ .word 0x6fa87e4f ++ .word 0xfe2ce6e0 ++ .word 0xfe2ce6e0 ++ .word 0xfe2ce6e0 ++ .word 0xfe2ce6e0 ++ .word 0xa3014314 ++ .word 0xa3014314 ++ .word 0xa3014314 ++ .word 0xa3014314 ++ .word 0x4e0811a1 ++ .word 0x4e0811a1 ++ .word 0x4e0811a1 ++ .word 0x4e0811a1 ++ .word 0xf7537e82 ++ .word 0xf7537e82 ++ .word 0xf7537e82 ++ .word 0xf7537e82 ++ .word 0xbd3af235 ++ .word 0xbd3af235 ++ .word 0xbd3af235 ++ .word 0xbd3af235 ++ .word 0x2ad7d2bb ++ .word 0x2ad7d2bb ++ .word 0x2ad7d2bb ++ .word 0x2ad7d2bb ++ .word 0xeb86d391 ++ .word 0xeb86d391 ++ .word 0xeb86d391 ++ .word 0xeb86d391 ++ .size md5_mb_asimd_x4, .-md5_mb_asimd_x4 +diff --git a/drv/hash_mb/md5_mb_sve.S b/drv/hash_mb/md5_mb_sve.S +new file mode 100644 +index 0000000..8d8ecc1 +--- /dev/null ++++ b/drv/hash_mb/md5_mb_sve.S +@@ -0,0 +1,158 @@ ++/********************************************************************** ++ Copyright(c) 2022 Arm Corporation All rights reserved. ++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. 
++ * Neither the name of Arm Corporation nor the names of its
++ contributors may be used to endorse or promote products derived
++ from this software without specific prior written permission.
++
++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++**********************************************************************/
++
++ .arch armv8.2-a+sve
++
++// copying data from sparse memory into contiguous stack space
++// in order to gather-load into SVE registers
++.macro copy_mb_16words vecs:req,dest:req
++ mov src,\vecs
++ mov dst,\dest
++ mov counter,total_lanes
++10:
++ ldr tmp,[src],8 // next job pointer from the pointer array, then step to the following entry
++ ldr tmp,[tmp] // first 8 bytes of the job; presumably its data buffer pointer - TODO confirm against the job layout
++ add tmp,tmp,block_ctr,lsl 6 // skip block_ctr blocks of 64 bytes
++ ld1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [tmp] // read one 64-byte block
++ st1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [dst],64 // append it to the staging buffer
++ subs counter,counter,1
++ b.ne 10b
++.endm
++
++.macro load_init
++ mov tmpw,16
++ index VOFFS.s,0,tmpw // VOFFS = 0,16,32,... word indexes, i.e. one 64-byte block per lane once scaled by 4
++ copy_mb_16words job_vec,databuf // stage the current block of every lane at databuf
++.endm
++
++.macro load_word pipelines:req,windex:req,zreg0:req,zreg1
++ add tmp,databuf,\windex * 4 // address of word windex in the first staged block
++ ld1w { \zreg0\().s}, p0/z, [tmp, VOFFS.s, UXTW 2] // gather that word from every lane of pipeline 0
++ .if \pipelines > 1
++ add tmp,tmp,veclen,lsl #6 // second pipeline starts veclen blocks further on
++ ld1w {\zreg1\().s}, p1/z, [tmp, VOFFS.s, UXTW 2]
++ .endif
++.endm
++
++#include "md5_sve_common.S"
++
++/* int md5_mb_sve_max_lanes() - returns twice the number of 32-bit elements in one SVE vector
++ * NOTE(review): the included file ends in .section .rodata.cst16 and no .text directive follows the include here - confirm which section the code below is assembled into */
++ .global md5_mb_sve_max_lanes
++ .type md5_mb_sve_max_lanes, %function
++md5_mb_sve_max_lanes:
++ cntw x0 // x0 = number of 32-bit elements per vector
++ add x0,x0,x0 // doubled: the code below can drive two vectors (p0 and p1 groups)
++ ret
++ .size md5_mb_sve_max_lanes, .-md5_mb_sve_max_lanes
++
++/* Run num_blocks 64-byte blocks for total_lanes jobs in parallel.
++ * void md5_mb_sve(int blocks, int total_lanes, MD5_JOB **job_vec)
++ * The 16 bytes at offset 64 of each job are loaded as the digest and stored back at the end. */
++ num_blocks .req w0
++ total_lanes .req w1
++ job_vec .req x2
++ src .req x5
++ dst .req x6
++ tmp .req x8
++ tmpw .req w8
++ block_ctr .req x9
++ block_ctr_w .req w9
++ savedsp .req x10
++ databuf .req x11
++ counter .req w12
++ veclen .req x13
++ veclen_w .req w13
++ abcd_buf .req x14
++ md5key_adr .req x15
++
++ .global md5_mb_sve
++ .type md5_mb_sve, %function
++md5_mb_sve:
++ cbz num_blocks,.return // zero blocks: return without touching the stack
++ md5_sve_save_stack
++ mov savedsp,sp
++ // reserve (16 * lanes) for abcd buf
++ mov tmpw,total_lanes,lsl 4
++ sub abcd_buf,sp,tmp
++ // reserve (64 * lanes) for data buf
++ mov tmpw,total_lanes,lsl 6
++ sub databuf,abcd_buf,tmp
++ mov sp,databuf // stack now covers both scratch buffers
++ adr md5key_adr,MD5_CONST_KEYS
++ whilelo p0.s,wzr,total_lanes // p0: lanes 0 .. total_lanes-1 that fit in the first vector
++ mov src,job_vec
++ mov dst,abcd_buf
++ mov counter,total_lanes
++.ldr_hash:
++ ldr tmp,[src],8 // job pointer
++ add tmp,tmp,64 // digest lives at job + 64
++ ld1 {v0.16b},[tmp]
++ st1 {v0.16b},[dst],16 // pack the 16-byte digests back to back in abcd_buf
++ subs counter,counter,1
++ bne .ldr_hash
++ ld4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0/z,[abcd_buf] // de-interleave into A, B, C, D vectors
++ mov block_ctr,0
++ cntp veclen,p0,p0.s // number of lanes handled by the first vector
++ cmp veclen_w,total_lanes
++ b.eq .loop_1x // every lane fits in one vector
++ whilelo p1.s,veclen_w,total_lanes // p1: the remaining lanes, handled by the second vector
++ add tmp,abcd_buf,veclen,lsl #4
++ ld4w {VA_1.s,VB_1.s,VC_1.s,VD_1.s},p1/z,[tmp]
++ b .loop_2x
++.loop_1x:
++ md5_single 1 // one block for every lane, single pipeline
++ add block_ctr, block_ctr, 1
++ cmp block_ctr_w,num_blocks
++ bne .loop_1x
++ st4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0,[abcd_buf]
++ b 1f
++.loop_2x:
++ md5_single 2 // one block for every lane, two interleaved pipelines
++ add block_ctr, block_ctr, 1
++ cmp block_ctr_w,num_blocks
++ bne .loop_2x
++ st4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0,[abcd_buf]
++ add tmp,abcd_buf,veclen,lsl #4
++ st4w {VA_1.s,VB_1.s,VC_1.s,VD_1.s},p1,[tmp]
++1:
++ mov dst,job_vec
++ mov src,abcd_buf
++.str_hash:
++ ld1 {v0.16b},[src],16
++ ldr tmp,[dst],8
++ add tmp,tmp,64
++ st1 {v0.16b},[tmp] // write the updated digest back to job + 64
++ subs total_lanes,total_lanes,1 // total_lanes (w1) doubles as the loop counter and is consumed here
++ bne .str_hash
++ mov sp,savedsp // release both scratch buffers
++ md5_sve_restore_stack
++.return:
++ ret
++ .size md5_mb_sve, .-md5_mb_sve
+diff --git a/drv/hash_mb/md5_sve_common.S b/drv/hash_mb/md5_sve_common.S
+new file mode 100644
+index 0000000..ed81482
+--- /dev/null
++++ b/drv/hash_mb/md5_sve_common.S
+@@ -0,0 +1,478 @@
++/**********************************************************************
++ Copyright(c) 2022 Arm Corporation All rights reserved.
++
++ Redistribution and use in source and binary forms, with or without
++ modification, are permitted provided that the following conditions
++ are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in
++ the documentation and/or other materials provided with the
++ distribution.
++ * Neither the name of Arm Corporation nor the names of its
++ contributors may be used to endorse or promote products derived
++ from this software without specific prior written permission.
++
++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++**********************************************************************/
++ VK .req z0 // round constant, broadcast to all lanes by ld1rw in MD5_STEP_x*
++ VOFFS .req z1 // index vector used by the gather loads in load_word
++ VA_0 .req z2 // pipeline 0 working state A, B, C, D (aliases rotate in SWAP_STATES)
++ VB_0 .req z3
++ VC_0 .req z4
++ VD_0 .req z5
++ VF_0 .req z6 // round function result, pipeline 0
++ VF_1 .req z7 // round function result, pipeline 1
++ VA_1 .req z16 // pipeline 1 working state A, B, C, D
++ VB_1 .req z17
++ VC_1 .req z18
++ VD_1 .req z19
++ MD5WORD0_0 .req z20 // message word buffers: MD5WORD{0,1} alternate per step, suffix is the pipeline
++ MD5WORD1_0 .req z21
++ MD5WORD0_1 .req z22
++ MD5WORD1_1 .req z23
++ TMPV0 .req v20 // NEON names for registers 20-23; used as copy temporaries by copy_mb_16words
++ TMPV1 .req v21
++ TMPV2 .req v22
++ TMPV3 .req v23
++ VTMP_0 .req z24 // scratch, pipeline 0
++ VAA_0 .req z25 // copy of the state taken by prepare_x*, added back by finish_x*
++ VBB_0 .req z26
++ VCC_0 .req z27
++ VDD_0 .req z28
++ VTMP_1 .req z29 // scratch, pipeline 1
++ VAA_1 .req z30
++ VBB_1 .req z31
++ VCC_1 .req z8 // registers 8 and 9: d8/d9 are saved by md5_sve_save_stack
++ VDD_1 .req z9
++ TT .req z0 // placeholder alias, rebound inside SWAP_STATES
++
++.macro rotate_left_x1 out:req,in:req,tmp:req,bits
++ .if \bits == 16
++ revh \out\().s,p0/m,\in\().s // rotating by 16 equals swapping the halfwords of each word
++ .else
++ .if have_sve2 == 0
++ lsl \tmp\().s, \in\().s,\bits
++ lsr \out\().s,\in\().s,32-\bits
++ orr \out\().d,\out\().d,\tmp\().d // out = (in lsl bits) or (in lsr (32 - bits))
++ .else
++ movprfx \out\().d,\in\().d
++ xar \out\().s,\out\().s,VZERO.s,32-\bits // NOTE(review): VZERO has no .req in the visible sources - confirm where the SVE2 build defines it
++ .endif
++ .endif
++.endm
++
++.macro rotate_left_x2 out:req,in:req,tmp:req,bits,out1:req,in1:req,tmp1:req,bits1
++ // two-pipeline form of rotate_left_x1; only bits is tested for the 16 case, MD5_STEP_x2 passes the same count twice
++ .if \bits == 16
++ revh \out\().s,p0/m,\in\().s
++ revh \out1\().s,p0/m,\in1\().s
++ .else
++ .if have_sve2 == 0
++ lsl \tmp\().s, \in\().s,\bits
++ lsl \tmp1\().s, \in1\().s,\bits1
++ lsr \out\().s,\in\().s,32-\bits
++ lsr \out1\().s,\in1\().s,32-\bits1
++ orr \out\().d,\out\().d,\tmp\().d
++ orr \out1\().d,\out1\().d,\tmp1\().d
++ .else
++ movprfx \out\().d,\in\().d
++ xar \out\().s,\out\().s,VZERO.s,32-\bits
++ movprfx \out1\().d,\in1\().d
++ xar \out1\().s,\out1\().s,VZERO.s,32-\bits1
++ .endif
++ .endif
++.endm
++
++.macro bsl_x1 ret:req,x:req,y:req,z:req,tmp:req
++ .if have_sve2 == 0
++ bic \ret\().d,\z\().d,\x\().d // ret = z and (not x)
++ and \tmp\().d,\x\().d,\y\().d // tmp = x and y
++ orr \ret\().d,\ret\().d,\tmp\().d // ret = bitwise select: y where x is set, z elsewhere
++ .else
++ movprfx \ret\().d,\x\().d
++ bsl \ret\().d,\ret\().d,\y\().d,\z\().d
++ .endif
++.endm
++
++.macro bsl_x2 ret:req,x:req,y:req,z:req,tmp:req,ret1:req,x1:req,y1:req,z1:req,tmp1:req
++ .if have_sve2 == 0
++ bic \ret\().d,\z\().d,\x\().d
++ bic \ret1\().d,\z1\().d,\x1\().d
++ and \tmp\().d,\x\().d,\y\().d
++ and \tmp1\().d,\x1\().d,\y1\().d
++ orr \ret\().d,\ret\().d,\tmp\().d
++ orr \ret1\().d,\ret1\().d,\tmp1\().d
++ .else
++ movprfx \ret\().d,\x\().d
++ bsl \ret\().d,\ret\().d,\y\().d,\z\().d
++ movprfx \ret1\().d,\x1\().d
++ bsl \ret1\().d,\ret1\().d,\y1\().d,\z1\().d
++ .endif
++.endm
++
++
++// F = D ^ (B and (C xor D))
++// that is (B and C) or ((not B) and D)
++.macro FUNC_F0_x1
++ bsl_x1 VF_0,VB_0,VC_0,VD_0,VTMP_0
++.endm
++
++.macro FUNC_F0_x2
++ bsl_x2 VF_0,VB_0,VC_0,VD_0,VTMP_0,VF_1,VB_1,VC_1,VD_1,VTMP_1
++.endm
++
++// F = C xor (D and (B xor C))
++// that is (D and B) or ((not D) and C)
++.macro FUNC_F1_x1
++ bsl_x1 VF_0,VD_0,VB_0,VC_0,VTMP_0
++.endm
++
++.macro FUNC_F1_x2
++ bsl_x2 VF_0,VD_0,VB_0,VC_0,VTMP_0,VF_1,VD_1,VB_1,VC_1,VTMP_1
++.endm
++
++// F := B xor C xor D
++.macro FUNC_F2_x1
++ .if have_sve2 == 0
++ eor VF_0.d,VB_0.d,VC_0.d
++ eor VF_0.d,VF_0.d,VD_0.d
++ .else
++ movprfx VF_0.d,VB_0.d
++ eor3 VF_0.d,VF_0.d,VC_0.d,VD_0.d
++ .endif
++.endm
++
++.macro FUNC_F2_x2
++ .if have_sve2 == 0
++ eor VF_0.d,VB_0.d,VC_0.d
++ eor VF_1.d,VB_1.d,VC_1.d
++ eor VF_0.d,VF_0.d,VD_0.d
++ eor VF_1.d,VF_1.d,VD_1.d
++ .else
++ movprfx VF_0.d,VB_0.d
++ eor3 VF_0.d,VF_0.d,VC_0.d,VD_0.d
++ movprfx VF_1.d,VB_1.d
++ eor3 VF_1.d,VF_1.d,VC_1.d,VD_1.d
++ .endif
++.endm
++
++// F := C xor (B or (not D))
++.macro FUNC_F3_x1
++ not VF_0.s,p0/m,VD_0.s
++ orr VF_0.d,VF_0.d,VB_0.d
++ eor VF_0.d,VF_0.d,VC_0.d
++.endm
++
++.macro FUNC_F3_x2
++ not VF_0.s,p0/m,VD_0.s
++ not VF_1.s,p0/m,VD_1.s
++ orr VF_0.d,VF_0.d,VB_0.d
++ orr VF_1.d,VF_1.d,VB_1.d
++ eor VF_0.d,VF_0.d,VC_0.d
++ eor VF_1.d,VF_1.d,VC_1.d
++.endm
++
++.macro SWAP_STATES
++ .unreq TT // rotate the register names instead of moving data:
++ TT .req VA_0 // after this macro A names the old D register, B the old A,
++ .unreq VA_0 // C the old B and D the old C
++ VA_0 .req VD_0
++ .unreq VD_0
++ VD_0 .req VC_0
++ .unreq VC_0
++ VC_0 .req VB_0
++ .unreq VB_0
++ VB_0 .req TT
++
++ .unreq TT // same rotation for the pipeline 1 names
++ TT .req VA_1
++ .unreq VA_1
++ VA_1 .req VD_1
++ .unreq VD_1
++ VD_1 .req VC_1
++ .unreq VC_1
++ VC_1 .req VB_1
++ .unreq VB_1
++ VB_1 .req TT
++.endm
++
++.macro MD5_STEP_x1 windex:req,mg:req,func_f:req,bits:req
++ ld1rw {VK.s},p0/z,[md5key_adr,windex * 4] // broadcast constant number windex from MD5_CONST_KEYS
++ \func_f\()_x1
++ add VTMP_0.s,VA_0.s,\mg\()_0.s // A + message word
++ add VF_0.s,VF_0.s,VK.s // F + K
++ add VF_0.s,VF_0.s,VTMP_0.s // F + K + A + message word
++ rotate_left_x1 VA_0,VF_0,VTMP_0,\bits
++ add VA_0.s,VA_0.s,VB_0.s // A = B + rotated sum
++.endm
++
++.macro MD5_STEP_x2 windex:req,mg:req,func_f:req,bits:req
++ ld1rw {VK.s},p0/z,[md5key_adr,windex * 4] // one constant load shared by both pipelines
++ \func_f\()_x2
++ add VTMP_0.s,VA_0.s,\mg\()_0.s
++ add VTMP_1.s,VA_1.s,\mg\()_1.s
++ add VF_0.s,VF_0.s,VK.s
++ add VF_1.s,VF_1.s,VK.s
++ add VF_0.s,VF_0.s,VTMP_0.s
++ add VF_1.s,VF_1.s,VTMP_1.s
++ rotate_left_x2 VA_0,VF_0,VTMP_0,\bits,VA_1,VF_1,VTMP_1,\bits
++ add VA_0.s,VA_0.s,VB_0.s
++ add VA_1.s,VA_1.s,VB_1.s
++.endm
++
++.altmacro
++.macro load_words index:req,mg:req
++ load_word %num_pipelines,\index,MD5WORD\mg\()_0,MD5WORD\mg\()_1
++.endm
++
++.macro MD5_STEP_WRAPPER pipelines:req,windex:req,gindex:req,mg:req,\
++ func_f:req,bits:req,gindex_next,mg_next
++ .ifnb \gindex_next
++ load_words \gindex_next,\mg_next
++ .endif
++ MD5_STEP_x\pipelines\() \windex,MD5WORD\mg\(),\func_f,\bits
++.endm
++
++.macro exec_step windex:req,gindex:req,bits:req,gindex_next
++ .if \windex % 2 == 0
++ mg=0
++ mg_next=1
++ .else
++ mg=1
++ mg_next=0
++ .endif
++
++ .if \windex <= 15
++ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
++ FUNC_F0,\bits,\gindex_next,%mg_next
++ .endif
++ .if \windex >= 16 && \windex <= 31
++ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
++ FUNC_F1,\bits,\gindex_next,%mg_next
++ .endif
++ .if \windex >= 32 && \windex <= 47
++ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
++ FUNC_F2,\bits,\gindex_next,%mg_next
++ .endif
++ .if \windex >= 48 && \windex < 63
++ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
++ FUNC_F3,\bits,\gindex_next,%mg_next
++ .endif
++ .if \windex == 63
++ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,FUNC_F3,\bits
++ .endif
++ SWAP_STATES
++.endm
++
++.macro exec_steps
++ exec_step 0,0,7,1
++ exec_step 1,1,12,2
++ exec_step 2,2,17,3
++ exec_step 3,3,22,4
++ exec_step 4,4,7,5
++ exec_step 5,5,12,6
++ exec_step 6,6,17,7
++ exec_step 7,7,22,8
++ exec_step 8,8,7,9
++ exec_step 9,9,12,10
++ exec_step 10,10,17,11
++ exec_step 11,11,22,12
++ exec_step 12,12,7,13
++ exec_step 13,13,12,14
++ exec_step 14,14,17,15
++ exec_step 15,15,22,1
++ exec_step 16,1,5,6
++ exec_step 17,6,9,11
++ exec_step 18,11,14,0
++ exec_step 19,0,20,5
++ exec_step 20,5,5,10
++ exec_step 21,10,9,15
++ exec_step 22,15,14,4
++ exec_step 23,4,20,9
++ exec_step 24,9,5,14
++ exec_step 25,14,9,3
++ exec_step 26,3,14,8
++ exec_step 27,8,20,13
++ exec_step 28,13,5,2
++ exec_step 29,2,9,7
++ exec_step 30,7,14,12
++ exec_step 31,12,20,5
++ exec_step 32,5,4,8
++ exec_step 33,8,11,11
++ exec_step 34,11,16,14
++ exec_step 35,14,23,1
++ exec_step 36,1,4,4
++ exec_step 37,4,11,7
++ exec_step 38,7,16,10
++ exec_step 39,10,23,13
++ exec_step 40,13,4,0
++ exec_step 41,0,11,3
++ exec_step 42,3,16,6
++ exec_step 43,6,23,9
++ exec_step 44,9,4,12
++ exec_step 45,12,11,15
++ exec_step 46,15,16,2
++ exec_step 47,2,23,0
++ exec_step 48,0,6,7
++ exec_step 49,7,10,14
++ exec_step 50,14,15,5
++ exec_step 51,5,21,12
++ exec_step 52,12,6,3
++ exec_step 53,3,10,10
++ exec_step 54,10,15,1
++ exec_step 55,1,21,8
++ exec_step 56,8,6,15
++ exec_step 57,15,10,6
++ exec_step 58,6,15,13
++ exec_step 59,13,21,4
++ exec_step 60,4,6,11
++ exec_step 61,11,10,2
++ exec_step 62,2,15,9
++ exec_step 63,9,21
++.endm
++
++.macro prepare_x1
++ load_words 0,0 // fetch message word 0 into buffer 0 ahead of step 0
++ orr VAA_0.d,VA_0.d,VA_0.d // orr of a register with itself: plain copy of the incoming state
++ orr VBB_0.d,VB_0.d,VB_0.d
++ orr VCC_0.d,VC_0.d,VC_0.d
++ orr VDD_0.d,VD_0.d,VD_0.d
++.endm
++
++.macro prepare_x2
++ load_words 0,0
++ orr VAA_0.d,VA_0.d,VA_0.d
++ orr VAA_1.d,VA_1.d,VA_1.d
++ orr VBB_0.d,VB_0.d,VB_0.d
++ orr VBB_1.d,VB_1.d,VB_1.d
++ orr VCC_0.d,VC_0.d,VC_0.d
++ orr VCC_1.d,VC_1.d,VC_1.d
++ orr VDD_0.d,VD_0.d,VD_0.d
++ orr VDD_1.d,VD_1.d,VD_1.d
++.endm
++
++.macro finish_x1
++ add VA_0.s,VA_0.s,VAA_0.s // add the state saved by prepare_x1 back in
++ add VB_0.s,VB_0.s,VBB_0.s
++ add VC_0.s,VC_0.s,VCC_0.s
++ add VD_0.s,VD_0.s,VDD_0.s
++.endm
++
++.macro finish_x2
++ add VA_0.s,VA_0.s,VAA_0.s
++ add VA_1.s,VA_1.s,VAA_1.s
++ add VB_0.s,VB_0.s,VBB_0.s
++ add VB_1.s,VB_1.s,VBB_1.s
++ add VC_0.s,VC_0.s,VCC_0.s
++ add VC_1.s,VC_1.s,VCC_1.s
++ add VD_0.s,VD_0.s,VDD_0.s
++ add VD_1.s,VD_1.s,VDD_1.s
++.endm
++
++.macro md5_single pipelines:req,sve2
++ .ifnb \sve2
++ have_sve2=1
++ eor VZERO.d,VZERO.d,VZERO.d
++ .else
++ have_sve2=0
++ .endif
++ num_pipelines=\pipelines
++ load_init // provided by the including file
++
++ prepare_x\pipelines\()
++ exec_steps
++ finish_x\pipelines\()
++.endm
++
++.macro md5_sve_save_stack
++ stp d8,d9,[sp, -48]!
++ stp d10,d11,[sp, 16]
++ stp d12,d13,[sp, 32]
++.endm
++
++.macro md5_sve_restore_stack
++ ldp d10,d11,[sp, 16]
++ ldp d12,d13,[sp, 32]
++ ldp d8,d9,[sp],48
++.endm
++
++ .section .rodata.cst16,"aM",@progbits,16
++ .align 16 // NOTE(review): on AArch64 gas the .align operand is a power-of-two exponent, so this asks for 64 KiB alignment - confirm 16 bytes was not the intent
++
++MD5_CONST_KEYS:
++ .word 0xd76aa478
++ .word 0xe8c7b756
++ .word 0x242070db
++ .word 0xc1bdceee
++ .word 0xf57c0faf
++ .word 0x4787c62a
++ .word 0xa8304613
++ .word 0xfd469501
++ .word 0x698098d8
++ .word 0x8b44f7af
++ .word 0xffff5bb1
++ .word 0x895cd7be
++ .word 0x6b901122
++ .word 0xfd987193
++ .word 0xa679438e
++ .word 0x49b40821
++ .word 0xf61e2562
++ .word 0xc040b340
++ .word 0x265e5a51
++ .word 0xe9b6c7aa
++ .word 0xd62f105d
++ .word 0x02441453
++ .word 0xd8a1e681
++ .word 0xe7d3fbc8
++ .word 0x21e1cde6
++ .word 0xc33707d6
++ .word 0xf4d50d87
++ .word 0x455a14ed
++ .word 0xa9e3e905
++ .word 0xfcefa3f8
++ .word 0x676f02d9
++ .word 0x8d2a4c8a
++ .word 0xfffa3942
++ .word 0x8771f681
++ .word 0x6d9d6122
++ .word 0xfde5380c
++ .word 0xa4beea44
++ .word 0x4bdecfa9
++ .word 0xf6bb4b60
++ .word 0xbebfbc70
++ .word 0x289b7ec6
++ .word 0xeaa127fa
++ .word 0xd4ef3085
++ .word 0x04881d05
++ .word 0xd9d4d039
++ .word 0xe6db99e5
++ .word 0x1fa27cf8
++ .word 0xc4ac5665
++ .word 0xf4292244
++ .word 0x432aff97
++ .word 0xab9423a7
++ .word 0xfc93a039
++ .word 0x655b59c3
++ .word 0x8f0ccc92
++ .word 0xffeff47d
++ .word 0x85845dd1
++ .word 0x6fa87e4f
++ .word 0xfe2ce6e0
++ .word 0xa3014314
++ .word 0x4e0811a1
++ .word 0xf7537e82
++ .word 0xbd3af235
++ .word 0x2ad7d2bb
++ .word 0xeb86d391
+diff --git a/drv/hash_mb/sm3_mb_asimd_x1.S b/drv/hash_mb/sm3_mb_asimd_x1.S
+new file mode 100644
+index 0000000..c7362de
+--- /dev/null
++++ b/drv/hash_mb/sm3_mb_asimd_x1.S
+@@ -0,0 +1,387 @@
++/**********************************************************************
++ Copyright(c) 2020 Arm Corporation All rights reserved.
++
++ Redistribution and use in source and binary forms, with or without
++ modification, are permitted provided that the following conditions
++ are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in
++ the documentation and/or other materials provided with the
++ distribution.
++ * Neither the name of Arm Corporation nor the names of its
++ contributors may be used to endorse or promote products derived
++ from this software without specific prior written permission.
++
++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++**********************************************************************/
++ .arch armv8.2-a
++ .text
++ .align 2
++ .p2align 3,,7
++
++.macro declare_var_vector_reg name:req,reg:req
++ q\name\() .req q\reg
++ v\name\() .req v\reg
++ s\name\() .req s\reg
++.endm
++
++ job .req x0
++ len .req x1
++ data .req x2
++ digest .req x0 // same register as job: valid once job has been post-incremented to the digest
++
++ msg0 .req w3 // msg0..msg4: temporaries of the message expansion
++ msg1 .req w4
++ msg2 .req w5
++ msg3 .req w6
++ msg4 .req w7
++
++ msg .req w9 // message word of the current round
++ msgP .req w10 // msg xor the word four positions later
++ SS1 .req w11
++ SS2 .req w12
++ TT1 .req w13
++ TT2 .req w14
++ Tj .req w15 // round constant, rotated left by one bit after every round
++ tmp0 .req w19
++ tmp1 .req w20
++ dig_A .req w21 // dig_A..dig_H: working state, held in callee-saved registers
++ dig_B .req w22
++ dig_C .req w23
++ dig_D .req w24
++ dig_E .req w25
++ dig_F .req w26
++ dig_G .req w27
++ dig_H .req w28
++
++ declare_var_vector_reg dig0,0
++ declare_var_vector_reg dig1,1
++ declare_var_vector_reg dig0_bak,2
++ declare_var_vector_reg dig1_bak,3
++ declare_var_vector_reg vect_msg0,4
++ declare_var_vector_reg vect_msg1,5
++ declare_var_vector_reg vect_msg2,6
++ declare_var_vector_reg vect_msg3,7
++
++ declare_var_vector_reg vect_msgP0,16
++ declare_var_vector_reg vect_msgP1,17
++ declare_var_vector_reg vect_msgP2,18
++
++
++
++
++
++
++// round 0-11
++.macro sm3_round_0 round:req
++ ldr msg, [sp,msg_off+4*\round\()] // message word stored by the function prologue of the loop
++ ldr msgP,[sp,wp_off +4*\round\()] // precomputed word xor word+4
++ add SS1,dig_E,Tj
++ ror TT1,dig_A,32-12 // rotate A left by 12
++ add SS1,SS1,TT1
++ ror SS1,SS1,32-7 //SS1 done
++ eor SS2,SS1,TT1 //SS2 done
++ eor TT1,dig_A,dig_B
++ eor TT2,dig_E,dig_F
++ add SS2,SS2,msgP
++ eor TT2,TT2,dig_G // E xor F xor G
++ add SS1,SS1,msg
++ eor TT1,TT1,dig_C // A xor B xor C
++ add SS2,SS2,dig_D
++ add SS1,SS1,dig_H
++ add TT1,TT1,SS2 // TT1 done
++ add TT2,TT2,SS1 // TT2 done
++ mov dig_D,dig_C
++ ror dig_C,dig_B,32-9 // C = B rotated left by 9
++ mov dig_B,dig_A
++ mov dig_A,TT1
++ eor TT1,TT2,TT2,ror (32-17)
++ mov dig_H,dig_G
++ ror dig_G,dig_F,32-19 // G = F rotated left by 19
++ mov dig_F,dig_E
++ eor dig_E,TT1,TT2,ror(32-9) // E = TT2 xor rotl(TT2,9) xor rotl(TT2,17)
++ ror Tj,Tj,(32-1)
++.endm
++
++//round 12-15
++.macro sm3_round_12 round:req
++ ldr msg, [sp,msg_off+4*((\round\())%17)] // message words now live in a 17-entry ring on the stack
++ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
++ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
++ add SS1,dig_E,Tj
++ ror TT1,dig_A,32-12
++ add SS1,SS1,TT1
++ ror SS1,SS1,32-7 //SS1 done
++ eor SS2,SS1,TT1 //SS2 done
++
++ eor msg0,msg0,msg1
++ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
++ eor TT1,dig_A,dig_B
++ eor TT2,dig_E,dig_F
++ add SS2,SS2,dig_D
++ eor TT2,TT2,dig_G
++ add SS1,SS1,msg
++ eor msg0,msg0,msg2,ror (32-15)
++ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
++ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
++ eor msg1,msg0,msg0,ror (32 -15)
++ eor TT1,TT1,dig_C
++ add TT1,TT1,SS2
++ eor msg4,msg4,msg3, ror (32-7)
++ eor msg0,msg1,msg0, ror (32-23) // x xor rotl(x,15) xor rotl(x,23)
++ add SS1,SS1,dig_H
++ eor msg0,msg0,msg4 // expanded word for position round+4
++ add TT2,TT2,SS1
++ mov dig_D,dig_C
++ str msg0,[sp,msg_off+4*((\round\()+4)%17)] // store it into the ring
++ eor msgP,msg,msg0 // computed on the fly here instead of loaded from wp_off
++ add TT1,TT1,msgP
++ ror dig_C,dig_B,32-9
++ mov dig_B,dig_A
++ mov dig_A,TT1
++ eor TT1,TT2,TT2,ror (32-17)
++ mov dig_H,dig_G
++ ror dig_G,dig_F,32-19
++ mov dig_F,dig_E
++ eor dig_E,TT1,TT2,ror(32-9)
++ ror Tj,Tj,32-1
++.endm
++
++// round 16-62
++.macro sm3_round_16 round:req
++ ldr msg, [sp,msg_off+4*((\round\())%17)]
++ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
++ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
++ add SS1,dig_E,Tj
++ ror TT1,dig_A,32-12
++ add SS1,SS1,TT1
++ ror SS1,SS1,32-7 //SS1 done
++ eor SS2,SS1,TT1 //SS2 done
++
++ eor msg0,msg0,msg1
++ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
++ orr TT1,dig_B,dig_C
++ and tmp0,dig_B,dig_C
++
++ eor TT2,dig_F,dig_G
++ and TT1,TT1,dig_A
++ add SS2,SS2,dig_D
++ orr TT1,TT1,tmp0 // ((B or C) and A) or (B and C): majority of A, B, C
++ and TT2,TT2,dig_E
++ add SS1,SS1,msg
++ eor TT2,TT2,dig_G // ((F xor G) and E) xor G: F where E is set, G elsewhere
++
++ eor msg0,msg0,msg2,ror (32-15)
++ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
++ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
++ eor msg1,msg0,msg0,ror (32 -15)
++ add TT1,TT1,SS2
++ eor msg4,msg4,msg3, ror (32-7)
++ eor msg0,msg1,msg0, ror (32-23)
++ add SS1,SS1,dig_H
++ eor msg0,msg0,msg4
++ add TT2,TT2,SS1
++ mov dig_D,dig_C
++ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
++ eor msgP,msg,msg0
++ add TT1,TT1,msgP
++ ror dig_C,dig_B,32-9
++ mov dig_B,dig_A
++ mov dig_A,TT1
++ eor TT1,TT2,TT2,ror (32-17)
++ mov dig_H,dig_G
++ ror dig_G,dig_F,32-19
++ mov dig_F,dig_E
++ eor dig_E,TT1,TT2,ror(32-9)
++ ror Tj,Tj,32-1
++.endm
++
++//round 63
++.macro sm3_round_63 round:req
++ ldr msg, [sp,msg_off+4*((\round\())%17)]
++ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
++ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
++ add SS1,dig_E,Tj
++ ror TT1,dig_A,32-12
++ add SS1,SS1,TT1
++ ror SS1,SS1,32-7 //SS1 done
++ eor SS2,SS1,TT1 //SS2 done
++ eor msg0,msg0,msg1
++ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
++ orr TT1,dig_B,dig_C
++ and tmp0,dig_B,dig_C
++ eor TT2,dig_F,dig_G
++ and TT1,TT1,dig_A
++ add SS2,SS2,dig_D
++ orr TT1,TT1,tmp0
++ and TT2,TT2,dig_E
++ add SS1,SS1,msg
++ eor TT2,TT2,dig_G
++ eor msg0,msg0,msg2,ror (32-15)
++ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
++ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
++ eor msg1,msg0,msg0,ror (32 -15)
++ add TT1,TT1,SS2
++ eor msg4,msg4,msg3, ror (32-7)
++ eor msg0,msg1,msg0, ror (32-23)
++ add SS1,SS1,dig_H
++ eor msg0,msg0,msg4
++ add TT2,TT2,SS1
++ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
++ eor msgP,msg,msg0
++ add TT1,TT1,msgP
++ ins vdig0_bak.s[3],dig_C // last round: the new state goes straight into the vector lanes
++ ror dig_C,dig_B,32-9
++ ins vdig0_bak.s[1],dig_A
++ ins vdig0_bak.s[0],TT1
++ ins vdig0_bak.s[2],dig_C
++ eor TT1,TT2,TT2,ror (32-17)
++ ins vdig1_bak.s[3],dig_G
++ ror dig_G,dig_F,32-19
++ ins vdig1_bak.s[1],dig_E
++ ins vdig1_bak.s[2],dig_G
++ eor dig_E,TT1,TT2,ror(32-9)
++ ins vdig1_bak.s[0],dig_E
++.endm
++
++ .set wp_off , 96 // 12 precomputed msgP words, placed after the saved x19..x28 area
++ .set msg_off, 96 + 12*4 // message word ring
++#define STACK_SIZE 224
++ .global sm3_mb_asimd_x1
++ .type sm3_mb_asimd_x1, %function
++sm3_mb_asimd_x1:
++ stp x29,x30, [sp,-STACK_SIZE]!
++ cmp len,0 // flags are consumed by the ble below; nothing in between sets flags
++ ldr data,[job],64 // data pointer from the job; x0 then points 64 bytes in, at the digest
++ ldp qdig0,qdig1,[digest]
++ stp x19, x20, [sp, 16]
++ stp x21, x22, [sp, 32]
++ rev32 vdig0.16b,vdig0.16b // byte-swap each 32-bit digest word
++ stp x23, x24, [sp, 48]
++ rev32 vdig1.16b,vdig1.16b
++ stp x25, x26, [sp, 64]
++ stp x27, x28, [sp, 80]
++ ble .exit_func // len not positive: skip the loop (the exit path swaps and stores the digest back)
++
++.start_loop:
++
++ /** prepare first 12 round data **/
++ ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64 // one 64-byte block
++ mov Tj, 17689
++ umov dig_A,vdig0.s[0]
++ movk Tj, 0x79cc, lsl 16 // Tj = 0x79cc4519
++ rev32 vvect_msg0.16b,vvect_msg0.16b // byte-swap each message word
++ umov dig_B,vdig0.s[1]
++ rev32 vvect_msg1.16b,vvect_msg1.16b
++ umov dig_C,vdig0.s[2]
++ rev32 vvect_msg2.16b,vvect_msg2.16b
++ umov dig_D,vdig0.s[3]
++ rev32 vvect_msg3.16b,vvect_msg3.16b
++ umov dig_E,vdig1.s[0]
++ stp qvect_msg0,qvect_msg1,[sp,msg_off] // message words 0..7
++ umov dig_F,vdig1.s[1]
++ stp qvect_msg2,qvect_msg3,[sp,msg_off+32] // message words 8..15
++ umov dig_G,vdig1.s[2]
++ eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b // word j xor word j+4, for j = 0..3
++ eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b // j = 4..7
++ umov dig_H,vdig1.s[3]
++ stp qvect_msgP0,qvect_msgP1,[sp,wp_off]
++ eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b // j = 8..11
++ str qvect_msgP2,[sp,wp_off+32]
++
++ sm3_round_0 0
++ sm3_round_0 1
++ sm3_round_0 2
++ sm3_round_0 3
++ sm3_round_0 4
++ sm3_round_0 5
++ sm3_round_0 6
++ sm3_round_0 7
++ sm3_round_0 8
++ sm3_round_0 9
++ sm3_round_0 10
++ sm3_round_0 11
++
++ sm3_round_12 12
++ sm3_round_12 13
++ sm3_round_12 14
++ sm3_round_12 15
++ mov Tj, 0x7a87
++ movk Tj, 0x9d8a, lsl 16 // Tj = 0x9d8a7a87 from round 16 on
++ sm3_round_16 16
++ sm3_round_16 17
++ sm3_round_16 18
++ sm3_round_16 19
++ sm3_round_16 20
++ sm3_round_16 21
++ sm3_round_16 22
++ sm3_round_16 23
++ sm3_round_16 24
++ sm3_round_16 25
++ sm3_round_16 26
++ sm3_round_16 27
++ sm3_round_16 28
++ sm3_round_16 29
++ sm3_round_16 30
++ sm3_round_16 31
++ sm3_round_16 32
++ sm3_round_16 33
++ sm3_round_16 34
++ sm3_round_16 35
++ sm3_round_16 36
++ sm3_round_16 37
++ sm3_round_16 38
++ sm3_round_16 39
++ sm3_round_16 40
++ sm3_round_16 41
++ sm3_round_16 42
++ sm3_round_16 43
++ sm3_round_16 44
++ sm3_round_16 45
++ sm3_round_16 46
++ sm3_round_16 47
++ sm3_round_16 48
++ sm3_round_16 49
++ sm3_round_16 50
++ sm3_round_16 51
++ sm3_round_16 52
++ sm3_round_16 53
++ sm3_round_16 54
++ sm3_round_16 55
++ sm3_round_16 56
++ sm3_round_16 57
++ sm3_round_16 58
++ sm3_round_16 59
++ sm3_round_16 60
++ sm3_round_16 61
++ sm3_round_16 62
++ sm3_round_63 63
++ subs len,len,1 // one block consumed
++ eor vdig0.16b,vdig0.16b,vdig0_bak.16b // fold the new state into the digest with xor
++ eor vdig1.16b,vdig1.16b,vdig1_bak.16b
++ bne .start_loop
++.exit_func:
++ ldp x19, x20, [sp, 16]
++ rev32 vdig0.16b,vdig0.16b // undo the byte swap done on entry
++ ldp x21, x22, [sp, 32]
++ rev32 vdig1.16b,vdig1.16b
++ ldp x23, x24, [sp, 48]
++ stp qdig0,qdig1,[digest] // write the digest back into the job
++ ldp x25, x26, [sp, 64]
++ ldp x27, x28, [sp, 80]
++ ldp x29, x30, [sp], STACK_SIZE
++ ret
++ .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1
++
+diff --git a/drv/hash_mb/sm3_mb_asimd_x4.S b/drv/hash_mb/sm3_mb_asimd_x4.S
+new file mode 100644
+index 0000000..975a07c
+--- /dev/null
++++ b/drv/hash_mb/sm3_mb_asimd_x4.S
+@@ -0,0 +1,576 @@
++/**********************************************************************
++ Copyright(c) 2020 Arm Corporation All rights reserved.
++
++ Redistribution and use in source and binary forms, with or without
++ modification, are permitted provided that the following conditions
++ are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in
++ the documentation and/or other materials provided with the
++ distribution.
++ * Neither the name of Arm Corporation nor the names of its ++ contributors may be used to endorse or promote products derived ++ from this software without specific prior written permission. ++ ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++**********************************************************************/ ++ .arch armv8.2-a ++ .text ++ .align 2 ++ .p2align 3,,7 ++ ++.macro declare_var_vector_reg name:req,reg:req ++ q\name\() .req q\reg ++ v\name\() .req v\reg ++ s\name\() .req s\reg ++.endm ++ ++ job0 .req x0 ++ job1 .req x1 ++ job2 .req x2 ++ job3 .req x3 ++ len .req x4 ++ ++ job0_data .req x5 ++ job1_data .req x6 ++ job2_data .req x7 ++ job3_data .req x9 ++ ++ job0_digest .req x0 ++ job1_digest .req x1 ++ job2_digest .req x2 ++ job3_digest .req x3 ++ job0_tmp .req x10 ++ job1_tmp .req x11 ++ job2_tmp .req x12 ++ job3_tmp .req x13 ++ const_adr .req x14 ++ ++ ++ declare_var_vector_reg msg0,0 ++ declare_var_vector_reg msg1,1 ++ declare_var_vector_reg msg2,2 ++ declare_var_vector_reg msg3,3 ++ declare_var_vector_reg msg4,4 ++ declare_var_vector_reg msg5,5 ++ declare_var_vector_reg msg6,6 ++ declare_var_vector_reg msg7,7 ++ declare_var_vector_reg msg8,8 ++ declare_var_vector_reg msg9,9 ++ declare_var_vector_reg msg10,10 ++ declare_var_vector_reg msg11,11 ++ declare_var_vector_reg msg12,12 ++ declare_var_vector_reg msg13,13 ++ declare_var_vector_reg msg14,14 ++ declare_var_vector_reg msg15,15 ++ declare_var_vector_reg msg16,16 ++ ++ ++ declare_var_vector_reg dig_A,24 ++ declare_var_vector_reg dig_B,25 ++ declare_var_vector_reg dig_C,26 ++ declare_var_vector_reg dig_D,27 ++ declare_var_vector_reg dig_E,28 ++ declare_var_vector_reg dig_F,29 ++ declare_var_vector_reg dig_G,30 ++ declare_var_vector_reg dig_H,31 ++ ++ declare_var_vector_reg TT1,17 ++ declare_var_vector_reg TT2,18 ++ declare_var_vector_reg SS1,19 ++ declare_var_vector_reg SS2,20 ++ declare_var_vector_reg tmp0,21 ++ declare_var_vector_reg word_pair,23 ++ declare_var_vector_reg Tj,22 ++ ++ ++.macro rol32 target:req,reg:req,bit:req ++ ushr v\target\().4s,v\reg\().4s,32 - \bit ++ sli v\target\().4s,v\reg\().4s,\bit ++.endm ++ ++// round 0-11 ++.macro sm3_round_0 round:req,wp:req ++ ++ ushr vtmp0.4s,vdig_A.4s,32 - 12 ++ ++ add 
vSS1.4s,vdig_E.4s,vTj.4s ++ sli vtmp0.4s,vdig_A.4s,12 ++ rev32 vmsg\round\().16b,vmsg\round\().16b ++ rev32 vmsg\wp\().16b,vmsg\wp\().16b ++ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done ++ rol32 SS1,TT1,7 ++ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done ++ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b ++ ++ eor vTT1.16b,vdig_A.16b,vdig_B.16b ++ eor vTT2.16b,vdig_E.16b,vdig_F.16b ++ eor vTT1.16b,vTT1.16b,vdig_C.16b ++ eor vTT2.16b,vTT2.16b,vdig_G.16b ++ ++ add vSS1.4s,vSS1.4s,vmsg\round\().4s ++ add vSS2.4s,vSS2.4s,vword_pair.4s ++ add vTT1.4s,vTT1.4s,vdig_D.4s ++ add vTT2.4s,vTT2.4s,vdig_H.4s ++ ushr vtmp0.4s,vTj.4s,32-1 ++ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done ++ sli vtmp0.4s,vTj.4s,1 ++ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done ++ mov vTj.16b,vtmp0.16b ++ //D=C ++ mov vdig_D.16b,vdig_C.16b ++ //C = ROTL32(B, 9); ++ ushr vdig_C.4s,vdig_B.4s,32 - 9 ++ sli vdig_C.4s,vdig_B.4s,9 ++ //B=A ++ mov vdig_B.16b,vdig_A.16b ++ //A=TT1 ++ mov vdig_A.16b,vTT1.16b ++ // H=G ++ mov vdig_H.16b,vdig_G.16b ++ //G = ROTL32(F,19) ++ rol32 dig_G,dig_F,19 ++ //F = E ++ mov vdig_F.16b,vdig_E.16b ++ // E=Target, TT2=src, TT1,SS1,SS2 is free ++ // E = P0(TT2); ++ ushr vSS2.4s, vTT2.4s, 32 - 9 ++ ushr vSS1.4s, vTT2.4s, 32 - 17 ++ sli vSS2.4s, vTT2.4s, 9 ++ sli vSS1.4s, vTT2.4s, 17 ++ eor vdig_E.16b, vTT2.16b, vSS1.16b ++ eor vdig_E.16b, vdig_E.16b, vSS2.16b ++ ++.endm ++ ++ ++.macro sm3_round_4 round:req,wp:req ++ ++ ushr vtmp0.4s,vdig_A.4s,32 - 12 ++ add vSS1.4s,vdig_E.4s,vTj.4s ++ sli vtmp0.4s,vdig_A.4s,12 ++ rev32 vmsg\wp\().16b,vmsg\wp\().16b ++ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done ++ rol32 SS1,TT1,7 ++ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done ++ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b ++ eor vTT1.16b,vdig_A.16b,vdig_B.16b ++ eor vTT2.16b,vdig_E.16b,vdig_F.16b ++ eor vTT1.16b,vTT1.16b,vdig_C.16b ++ eor vTT2.16b,vTT2.16b,vdig_G.16b ++ add vSS1.4s,vSS1.4s,vmsg\round\().4s ++ add vSS2.4s,vSS2.4s,vword_pair.4s ++ add vTT1.4s,vTT1.4s,vdig_D.4s ++ add vTT2.4s,vTT2.4s,vdig_H.4s 
++ ushr vtmp0.4s,vTj.4s,32-1 ++ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done ++ sli vtmp0.4s,vTj.4s,1 ++ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done ++ mov vTj.16b,vtmp0.16b ++ //D=C ++ mov vdig_D.16b,vdig_C.16b ++ //C = ROTL32(B, 9); ++ ushr vdig_C.4s,vdig_B.4s,32 - 9 ++ sli vdig_C.4s,vdig_B.4s,9 ++ //B=A ++ mov vdig_B.16b,vdig_A.16b ++ //A=TT1 ++ mov vdig_A.16b,vTT1.16b ++ // H=G ++ mov vdig_H.16b,vdig_G.16b ++ //G = ROTL32(F,19) ++ rol32 dig_G,dig_F,19 ++ //F = E ++ mov vdig_F.16b,vdig_E.16b ++ // E=Target, TT2=src, TT1,SS1,SS2 is free ++ // E = P0(TT2); ++ ushr vSS2.4s, vTT2.4s, 32 - 9 ++ ushr vSS1.4s, vTT2.4s, 32 - 17 ++ sli vSS2.4s, vTT2.4s, 9 ++ sli vSS1.4s, vTT2.4s, 17 ++ eor vdig_E.16b, vTT2.16b, vSS1.16b ++ eor vdig_E.16b, vdig_E.16b, vSS2.16b ++ ++.endm ++ ++//round 12-15 ++.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4 ++ rol32 msg\plus_4,msg\m2,15 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b ++ rol32 tmp0,msg\plus_4,15 ++ rol32 word_pair,msg\plus_4,23 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b ++ rol32 tmp0,msg\m3,7 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b ++ ushr vtmp0.4s,vdig_A.4s,32 - 12 ++ sli vtmp0.4s,vdig_A.4s,12 ++ add vSS1.4s,vdig_E.4s,vTj.4s ++ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done ++ rol32 SS1,SS2,7 ++ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done ++ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b ++ eor vTT1.16b,vdig_A.16b,vdig_B.16b ++ eor vTT1.16b,vTT1.16b,vdig_C.16b ++ eor vTT2.16b,vdig_E.16b,vdig_F.16b ++ eor vTT2.16b,vTT2.16b,vdig_G.16b ++ add vSS1.4s,vSS1.4s,vmsg\round\().4s ++ add vSS2.4s,vSS2.4s,vword_pair.4s ++ add vTT1.4s,vTT1.4s,vdig_D.4s ++ add vTT2.4s,vTT2.4s,vdig_H.4s ++ ushr vtmp0.4s,vTj.4s,32-1 ++ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done ++ sli vtmp0.4s,vTj.4s,1 ++ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done ++ 
mov vTj.16b,vtmp0.16b ++ //D=C ++ mov vdig_D.16b,vdig_C.16b ++ //C = ROTL32(B, 9); ++ ushr vdig_C.4s,vdig_B.4s,32 - 9 ++ sli vdig_C.4s,vdig_B.4s,9 ++ //B=A ++ mov vdig_B.16b,vdig_A.16b ++ //A=TT1 ++ mov vdig_A.16b,vTT1.16b ++ // H=G ++ mov vdig_H.16b,vdig_G.16b ++ //G = ROTL32(F,19) ++ rol32 dig_G,dig_F,19 ++ //F = E ++ mov vdig_F.16b,vdig_E.16b ++ // E=Target, TT2=src, TT1,SS1,SS2 is free ++ // E = P0(TT2); ++ ushr vSS2.4s, vTT2.4s, 32 - 9 ++ ushr vSS1.4s, vTT2.4s, 32 - 17 ++ sli vSS2.4s, vTT2.4s, 9 ++ sli vSS1.4s, vTT2.4s, 17 ++ eor vdig_E.16b, vTT2.16b, vSS1.16b ++ eor vdig_E.16b, vdig_E.16b, vSS2.16b ++.endm ++ ++// round 16-62 ++.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4 ++ rol32 msg\plus_4,msg\m2,15 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b ++ rol32 tmp0,msg\plus_4,15 ++ rol32 word_pair,msg\plus_4,23 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b ++ rol32 tmp0,msg\m3,7 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b ++ ushr vtmp0.4s,vdig_A.4s,32 - 12 ++ sli vtmp0.4s,vdig_A.4s,12 ++ add vSS1.4s,vdig_E.4s,vTj.4s ++ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done ++ rol32 SS1,SS2,7 ++ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done ++ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b ++ mov vTT2.16b,vdig_E.16b ++ orr vTT1.16b,vdig_B.16b,vdig_C.16b ++ and vtmp0.16b,vdig_B.16b,vdig_C.16b ++ bsl vTT2.16b,vdig_F.16b,vdig_G.16b ++ and vTT1.16b,vTT1.16b,vdig_A.16b ++ add vSS1.4s,vSS1.4s,vmsg\round\().4s ++ orr vTT1.16b,vTT1.16b,vtmp0.16b ++ add vSS2.4s,vSS2.4s,vword_pair.4s ++ add vTT1.4s,vTT1.4s,vdig_D.4s ++ add vTT2.4s,vTT2.4s,vdig_H.4s ++ ushr vtmp0.4s,vTj.4s,32-1 ++ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done ++ sli vtmp0.4s,vTj.4s,1 ++ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done ++ mov vTj.16b,vtmp0.16b ++ //D=C ++ mov vdig_D.16b,vdig_C.16b ++ //C = 
ROTL32(B, 9); ++ ushr vdig_C.4s,vdig_B.4s,32 - 9 ++ sli vdig_C.4s,vdig_B.4s,9 ++ //B=A ++ mov vdig_B.16b,vdig_A.16b ++ //A=TT1 ++ mov vdig_A.16b,vTT1.16b ++ // H=G ++ mov vdig_H.16b,vdig_G.16b ++ //G = ROTL32(F,19) ++ rol32 dig_G,dig_F,19 ++ //F = E ++ mov vdig_F.16b,vdig_E.16b ++ // E=Target, TT2=src, TT1,SS1,SS2 is free ++ // E = P0(TT2); ++ ushr vSS2.4s, vTT2.4s, 32 - 9 ++ ushr vSS1.4s, vTT2.4s, 32 - 17 ++ sli vSS2.4s, vTT2.4s, 9 ++ sli vSS1.4s, vTT2.4s, 17 ++ eor vdig_E.16b, vTT2.16b, vSS1.16b ++ eor vdig_E.16b, vdig_E.16b, vSS2.16b ++.endm ++ ++//round 63 ++.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4 ++ rol32 msg\plus_4,msg\m2,15 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b ++ rol32 tmp0,msg\plus_4,15 ++ rol32 word_pair,msg\plus_4,23 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b ++ rol32 tmp0,msg\m3,7 ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b ++ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b ++ ushr vtmp0.4s,vdig_A.4s,32 - 12 ++ sli vtmp0.4s,vdig_A.4s,12 ++ add vSS1.4s,vdig_E.4s,vTj.4s ++ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done ++ rol32 SS1,SS2,7 ++ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done ++ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b ++ ++ ldp qmsg0,qmsg1,[sp,dig_off+ 0] ++ mov vTT2.16b,vdig_E.16b ++ ldp qmsg2,qmsg3,[sp,dig_off+ 32] ++ orr vTT1.16b,vdig_B.16b,vdig_C.16b ++ ldp qmsg4,qmsg5,[sp,dig_off+ 64] ++ and vtmp0.16b,vdig_B.16b,vdig_C.16b ++ bsl vTT2.16b,vdig_F.16b,vdig_G.16b ++ ldp qmsg6,qmsg7,[sp,dig_off+ 96] ++ and vTT1.16b,vTT1.16b,vdig_A.16b ++ add vSS1.4s,vSS1.4s,vmsg\round\().4s ++ orr vTT1.16b,vTT1.16b,vtmp0.16b ++ add vSS2.4s,vSS2.4s,vword_pair.4s ++ add vTT1.4s,vTT1.4s,vdig_D.4s ++ add vTT2.4s,vTT2.4s,vdig_H.4s ++ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done ++ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done ++ //D=C ++ eor vdig_D.16b,vdig_C.16b,vmsg3.16b ++ //C = 
ROTL32(B, 9); ++ ushr vdig_C.4s,vdig_B.4s,32 - 9 ++ sli vdig_C.4s,vdig_B.4s,9 ++ eor vdig_C.16b,vdig_C.16b,vmsg2.16b ++ //B=A ++ eor vdig_B.16b,vdig_A.16b,vmsg1.16b ++ stp qdig_C,qdig_D,[sp,dig_off+ 32] ++ //A=TT1 ++ eor vdig_A.16b,vTT1.16b,vmsg0.16b ++ // H=G ++ eor vdig_H.16b,vdig_G.16b,vmsg7.16b ++ stp qdig_A,qdig_B,[sp,dig_off+ 0] ++ //G = ROTL32(F,19) ++ rol32 dig_G,dig_F,19 ++ eor vdig_G.16b,vdig_G.16b,vmsg6.16b ++ //F = E ++ eor vdig_F.16b,vdig_E.16b,vmsg5.16b ++ stp qdig_G,qdig_H,[sp,dig_off+ 96] ++ // E=Target, TT2=src, TT1,SS1,SS2 is free ++ // E = P0(TT2); ++ ushr vSS2.4s, vTT2.4s, 32 - 9 ++ ushr vSS1.4s, vTT2.4s, 32 - 17 ++ sli vSS2.4s, vTT2.4s, 9 ++ sli vSS1.4s, vTT2.4s, 17 ++ eor vdig_E.16b, vTT2.16b, vSS1.16b ++ eor vdig_E.16b, vdig_E.16b, vSS2.16b ++ eor vdig_E.16b, vdig_E.16b, vmsg4.16b ++ stp qdig_E,qdig_F,[sp,dig_off+ 64] ++.endm ++ ++ .set dig_off , 80 ++ ++#define STACK_SIZE 224 ++ .global sm3_mb_asimd_x4 ++ .type sm3_mb_asimd_x4, %function ++sm3_mb_asimd_x4: ++ stp x29,x30, [sp,-STACK_SIZE]! 
++ cmp len,0 ++ //push d8~d15 ++ ldr job0_data, [job0],64 ++ stp d8,d9, [sp,16] ++ ldr job1_data, [job1],64 ++ stp d10,d11,[sp,32] ++ ldr job2_data, [job2],64 ++ stp d12,d13,[sp,48] ++ ldr job3_data, [job3],64 ++ stp d14,d15,[sp,64] ++ ble .exit_func ++ ++ mov job0_tmp,job0_digest ++ mov job1_tmp,job1_digest ++ mov job2_tmp,job2_digest ++ mov job3_tmp,job3_digest ++ //load digests ++ ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16 ++ ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16 ++ ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16 ++ adrp const_adr, .consts ++ ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16 ++ add const_adr, const_adr, #:lo12:.consts ++ ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp] ++ rev32 vdig_A.16b,vdig_A.16b ++ ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp] ++ rev32 vdig_B.16b,vdig_B.16b ++ ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp] ++ rev32 vdig_C.16b,vdig_C.16b ++ ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp] ++ rev32 vdig_D.16b,vdig_D.16b ++ stp qdig_A,qdig_B,[sp,dig_off+ 0] ++ rev32 vdig_E.16b,vdig_E.16b ++ rev32 vdig_F.16b,vdig_F.16b ++ stp qdig_C,qdig_D,[sp,dig_off+ 32] ++ rev32 vdig_G.16b,vdig_G.16b ++ rev32 vdig_H.16b,vdig_H.16b ++ stp qdig_E,qdig_F,[sp,dig_off+ 64] ++ stp qdig_G,qdig_H,[sp,dig_off+ 96] ++ ++.start_loop: ++ ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16 ++ ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16 ++ ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16 ++ ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16 ++ ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16 ++ ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16 ++ ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16 ++ ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16 ++ ld4 {vmsg8.s-vmsg11.16b}[0],[job0_data],16 ++ ldr qTj,[const_adr] ++ ++ sm3_round_0 0, 4 ++ ++ ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16 ++ sm3_round_0 1, 5 ++ ++ ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16 ++ sm3_round_0 2, 6 ++ ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16 ++ sm3_round_0 3, 7 ++ ++ ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16 ++ ++ sm3_round_4 4, 8 ++ ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16 ++ sm3_round_4 5, 9 ++ ld4 
{vmsg12.s-vmsg15.s}[2],[job2_data],16 ++ sm3_round_4 6,10 ++ ld4 {vmsg12.s-vmsg15.s}[3],[job3_data],16 ++ sm3_round_4 7,11 ++ sm3_round_4 8,12 ++ sm3_round_4 9,13 ++ sm3_round_4 10,14 ++ sm3_round_4 11,15 ++ ++ sm3_round_12 12,16, 0, 7,13, 3,10 //12 ++ sm3_round_12 13, 0, 1, 8,14, 4,11 //13 ++ sm3_round_12 14, 1, 2, 9,15, 5,12 //14 ++ sm3_round_12 15, 2, 3,10,16, 6,13 //15 ++ ++ ldr qTj,[const_adr,16] ++ sm3_round_16 16, 3, 4,11, 0, 7,14 //16 ++#if 0 ++ stp sdig_A,sdig_B,[job0_digest] ++ stp sdig_C,sdig_D,[job0_digest,8] ++ stp sdig_E,sdig_F,[job0_digest,16] ++ stp sdig_G,sdig_H,[job0_digest,24] ++ b .exit_func ++#endif ++ sm3_round_16 0, 4, 5,12, 1, 8,15 //17 ++ ++ sm3_round_16 1, 5, 6,13, 2, 9,16 //18 ++ sm3_round_16 2, 6, 7,14, 3,10, 0 //19 ++ sm3_round_16 3, 7, 8,15, 4,11, 1 //20 ++ sm3_round_16 4, 8, 9,16, 5,12, 2 //21 ++ sm3_round_16 5, 9,10, 0, 6,13, 3 //22 ++ sm3_round_16 6,10,11, 1, 7,14, 4 //23 ++ sm3_round_16 7,11,12, 2, 8,15, 5 //24 ++ sm3_round_16 8,12,13, 3, 9,16, 6 //25 ++ sm3_round_16 9,13,14, 4,10, 0, 7 //26 ++ sm3_round_16 10,14,15, 5,11, 1, 8 //27 ++ sm3_round_16 11,15,16, 6,12, 2, 9 //28 ++ sm3_round_16 12,16, 0, 7,13, 3,10 //29 ++ sm3_round_16 13, 0, 1, 8,14, 4,11 //30 ++ sm3_round_16 14, 1, 2, 9,15, 5,12 //31 ++ sm3_round_16 15, 2, 3,10,16, 6,13 //32 ++ sm3_round_16 16, 3, 4,11, 0, 7,14 //33 ++ sm3_round_16 0, 4, 5,12, 1, 8,15 //34 ++ sm3_round_16 1, 5, 6,13, 2, 9,16 //35 ++ sm3_round_16 2, 6, 7,14, 3,10, 0 //36 ++ sm3_round_16 3, 7, 8,15, 4,11, 1 //37 ++ sm3_round_16 4, 8, 9,16, 5,12, 2 //38 ++ sm3_round_16 5, 9,10, 0, 6,13, 3 //39 ++ sm3_round_16 6,10,11, 1, 7,14, 4 //40 ++ sm3_round_16 7,11,12, 2, 8,15, 5 //41 ++ sm3_round_16 8,12,13, 3, 9,16, 6 //42 ++ sm3_round_16 9,13,14, 4,10, 0, 7 //43 ++ sm3_round_16 10,14,15, 5,11, 1, 8 //44 ++ sm3_round_16 11,15,16, 6,12, 2, 9 //45 ++ sm3_round_16 12,16, 0, 7,13, 3,10 //46 ++ sm3_round_16 13, 0, 1, 8,14, 4,11 //47 ++ sm3_round_16 14, 1, 2, 9,15, 5,12 //48 ++ sm3_round_16 15, 2, 3,10,16, 6,13 //49 ++ 
sm3_round_16 16, 3, 4,11, 0, 7,14 //50 ++ sm3_round_16 0, 4, 5,12, 1, 8,15 //51 ++ sm3_round_16 1, 5, 6,13, 2, 9,16 //52 ++ sm3_round_16 2, 6, 7,14, 3,10, 0 //53 ++ sm3_round_16 3, 7, 8,15, 4,11, 1 //54 ++ sm3_round_16 4, 8, 9,16, 5,12, 2 //55 ++ sm3_round_16 5, 9,10, 0, 6,13, 3 //56 ++ sm3_round_16 6,10,11, 1, 7,14, 4 //57 ++ sm3_round_16 7,11,12, 2, 8,15, 5 //58 ++ sm3_round_16 8,12,13, 3, 9,16, 6 //59 ++ sm3_round_16 9,13,14, 4,10, 0, 7 //60 ++ sm3_round_16 10,14,15, 5,11, 1, 8 //61 ++ sm3_round_16 11,15,16, 6,12, 2, 9 //62 ++ sm3_round_63 12,16, 0, 7,13, 3,10 //63 ++ ++ subs len,len,1 ++ bne .start_loop ++ ++ //save digests with big endian ++ rev32 vdig_A.16b,vdig_A.16b ++ rev32 vdig_B.16b,vdig_B.16b ++ rev32 vdig_C.16b,vdig_C.16b ++ rev32 vdig_D.16b,vdig_D.16b ++ st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16 ++ rev32 vdig_E.16b,vdig_E.16b ++ rev32 vdig_F.16b,vdig_F.16b ++ st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16 ++ rev32 vdig_G.16b,vdig_G.16b ++ rev32 vdig_H.16b,vdig_H.16b ++ st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16 ++ st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16 ++ st4 {vdig_E.s-vdig_H.s}[0],[job0_digest] ++ st4 {vdig_E.s-vdig_H.s}[1],[job1_digest] ++ st4 {vdig_E.s-vdig_H.s}[2],[job2_digest] ++ st4 {vdig_E.s-vdig_H.s}[3],[job3_digest] ++ ++.exit_func: ++ ldp d8, d9, [sp,16] ++ ldp d10,d11,[sp,32] ++ ldp d12,d13,[sp,48] ++ ldp d14,d15,[sp,64] ++ ldp x29, x30, [sp], STACK_SIZE ++ ret ++.consts: ++ .word 0x79cc4519 ++ .word 0x79cc4519 ++ .word 0x79cc4519 ++ .word 0x79cc4519 ++ .word 0x9d8a7a87 ++ .word 0x9d8a7a87 ++ .word 0x9d8a7a87 ++ .word 0x9d8a7a87 ++ .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4 ++ +diff --git a/drv/hash_mb/sm3_mb_sve.S b/drv/hash_mb/sm3_mb_sve.S +new file mode 100644 +index 0000000..7dd2428 +--- /dev/null ++++ b/drv/hash_mb/sm3_mb_sve.S +@@ -0,0 +1,161 @@ ++/********************************************************************** ++ Copyright(c) 2022 Arm Corporation All rights reserved. 
++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ * Neither the name of Arm Corporation nor the names of its ++ contributors may be used to endorse or promote products derived ++ from this software without specific prior written permission. ++ ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++**********************************************************************/ ++ .arch armv8.2-a+sve ++ ++.macro copy_mb_16words vecs:req,dest:req ++ mov src,\vecs ++ mov dst,\dest ++ mov ctr,lanes ++1: ++ ldr tmp,[src],8 ++ ldr tmp,[tmp] ++ add tmp,tmp,block_ctr,lsl 6 ++ ld1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [tmp] ++ st1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [dst],64 ++ subs ctr,ctr,1 ++ b.ne 1b ++.endm ++ ++.macro load_words windex:req ++ .if \windex == 0 ++ mov tmpw,16 ++ index VOFFS.s,0,tmpw ++ copy_mb_16words job_vec,databuf ++ mov dataptr,databuf ++ .endif ++ ld1w { WORD\windex\().s}, p0/z, [dataptr, VOFFS.s, UXTW 2] ++ add dataptr,dataptr,4 ++.endm ++ ++#include "sm3_sve_common.S" ++ ++/* int sm3_mb_sve_max_lanes() ++ * return : max lanes of SVE vector ++ */ ++ .global sm3_mb_sve_max_lanes ++ .type sm3_mb_sve_max_lanes, %function ++sm3_mb_sve_max_lanes: ++ cntw x0 ++ ret ++ .size sm3_mb_sve_max_lanes, .-sm3_mb_sve_max_lanes ++/* ++ * void sm3_mb_sve(int blocks, int total_lanes, SM3_JOB **job_vec) ++ */ ++ num_blocks .req w0 ++ total_lanes .req w1 ++ job_vec .req x2 ++ lanes .req x4 ++ src .req x5 ++ dst .req x6 ++ lane_offset .req w7 ++ lane_offset_x .req x7 ++ tmp .req x8 ++ tmpw .req w8 ++ block_ctr .req x9 ++ block_ctr_w .req w9 ++ savedsp .req x10 ++ databuf .req x11 ++ dataptr .req x12 ++ efgh_buf .req x12 ++ ctr .req x13 ++ abcd_buf .req x14 ++ sm3const_adr .req x15 ++ ++ .global sm3_mb_sve ++ .type sm3_mb_sve, %function ++sm3_mb_sve: ++ cbz num_blocks,.return ++ sm3_sve_save_stack ++ mov savedsp,sp ++ mov lane_offset, #0 ++ whilelo p0.s, wzr, total_lanes ++ // reserve (32 * max lanes) for abcdefgh buf ++ cntw tmp ++ lsl tmp, tmp, 5 ++ sub abcd_buf,sp,tmp ++ mov tmp,63 ++ bic abcd_buf,abcd_buf,tmp ++ // reserve (64 * lanes) for data buf ++ cntp lanes,p0,p0.s ++ lsl tmp,lanes,6 ++ sub databuf,abcd_buf,tmp ++ mov sp,databuf ++ adr sm3const_adr,SM3_CONSTS ++.seg_loops: ++ mov src,job_vec ++ mov dst,abcd_buf ++ cntp lanes,p0,p0.s ++ add 
efgh_buf,abcd_buf,lanes,lsl 4 ++ mov ctr,lanes ++.ldr_hash: ++ ldr tmp,[src],8 ++ add tmp,tmp,64 ++ ld1 {v0.16b, v1.16b},[tmp] ++ rev32 v0.16b,v0.16b ++ rev32 v1.16b,v1.16b ++ st1 {v0.16b},[dst],16 ++ st1 {v1.16b},[efgh_buf],16 ++ subs ctr,ctr,1 ++ bne .ldr_hash ++ ld4w {VA.s,VB.s,VC.s,VD.s},p0/z,[abcd_buf] ++ add tmp,abcd_buf,lanes,lsl 4 ++ ld4w {VE.s,VF.s,VG.s,VH.s},p0/z,[tmp] ++ mov block_ctr,0 ++ // always unpredicated SVE mode in current settings ++ pred_mode=0 ++.block_loop: ++ sm3_single ++ add block_ctr, block_ctr, 1 ++ cmp block_ctr_w,num_blocks ++ bne .block_loop ++ st4w {VA.s,VB.s,VC.s,VD.s},p0,[abcd_buf] ++ add efgh_buf,abcd_buf,lanes,lsl 4 ++ st4w {VE.s,VF.s,VG.s,VH.s},p0,[efgh_buf] ++ mov dst,job_vec ++ mov src,abcd_buf ++ add job_vec,job_vec,lanes,lsl 3 ++ mov ctr,lanes ++.str_hash: ++ ld1 {v0.16b},[src],16 ++ ld1 {v1.16b},[efgh_buf],16 ++ rev32 v0.16b,v0.16b ++ rev32 v1.16b,v1.16b ++ ldr tmp,[dst],8 ++ add tmp,tmp,64 ++ st1 {v0.16b,v1.16b},[tmp] ++ subs ctr,ctr,1 ++ bne .str_hash ++ incw lane_offset_x ++ whilelo p0.s, lane_offset, total_lanes ++ b.mi .seg_loops ++ mov sp,savedsp ++ sm3_sve_restore_stack ++.return: ++ ret ++ .size sm3_mb_sve, .-sm3_mb_sve +diff --git a/drv/hash_mb/sm3_sve_common.S b/drv/hash_mb/sm3_sve_common.S +new file mode 100644 +index 0000000..3d54952 +--- /dev/null ++++ b/drv/hash_mb/sm3_sve_common.S +@@ -0,0 +1,505 @@ ++/********************************************************************** ++ Copyright(c) 2022 Arm Corporation All rights reserved. ++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ * Neither the name of Arm Corporation nor the names of its ++ contributors may be used to endorse or promote products derived ++ from this software without specific prior written permission. ++ ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++**********************************************************************/ ++ VA .req z0 ++ VB .req z1 ++ VC .req z2 ++ VD .req z3 ++ VE .req z4 ++ VF .req z5 ++ VG .req z6 ++ VH .req z7 ++ TMPV0 .req v8 ++ TMPV1 .req v9 ++ TMPV2 .req v10 ++ TMPV3 .req v11 ++ WORD0 .req z8 ++ WORD1 .req z9 ++ WORD2 .req z10 ++ WORD3 .req z11 ++ WORD4 .req z12 ++ WORD5 .req z13 ++ WORD6 .req z14 ++ WORD7 .req z15 ++ WORD8 .req z16 ++ WORD9 .req z17 ++ WORD10 .req z18 ++ WORD11 .req z19 ++ WORD12 .req z20 ++ WORD13 .req z21 ++ WORD14 .req z22 ++ WORD15 .req z23 ++ WORD16 .req z24 ++ VOFFS .req z24 // reuse WORD16 ++ SS1 .req z25 ++ SS2 .req z26 ++ VT .req z26 // reuse SS2 ++ TT2 .req z27 ++ VT1 .req z28 ++ VT2 .req z29 ++ VT3 .req z30 ++ VT4 .req z31 ++ VZERO .req z31 ++ TT .req z0 ++ ++.macro sve_op inst:req,regd,args:vararg ++ .if pred_mode == 1 ++ \inst \regd,p0/m,\args ++ .else ++ \inst \regd,\args ++ .endif ++.endm ++ ++.macro sve_bitop inst:req,regd:req,regm:req ++ .if pred_mode == 1 ++ \inst \regd\().s,p0/m,\regd\().s,\regm\().s ++ .else ++ \inst \regd\().d,\regd\().d,\regm\().d ++ .endif ++.endm ++ ++.macro rotate_left0 out:req,in:req,tmp:req,bits:req,args:vararg ++ .if have_sve2 == 0 ++ lsl \tmp\().s,\in\().s,\bits ++ .else ++ movprfx \out\().d,\in\().d ++ xar \out\().s,\out\().s,VZERO.s,32-\bits ++ .endif ++ ++ .ifnb \args ++ rotate_left0 \args ++ .endif ++.endm ++ ++.macro rotate_left1 out:req,in:req,tmp:req,bits:req,args:vararg ++ .if have_sve2 == 0 ++ lsr \out\().s,\in\().s,32-\bits ++ .endif ++ ++ .ifnb \args ++ rotate_left1 \args ++ .endif ++.endm ++ ++.macro rotate_left2 out:req,in:req,tmp:req,bits:req,args:vararg ++ .if have_sve2 == 0 ++ orr \out\().d,\out\().d,\tmp\().d ++ .endif ++ ++ .ifnb \args ++ rotate_left2 \args ++ .endif ++.endm ++ ++.macro rotate_left args:vararg ++ rotate_left0 \args ++ rotate_left1 \args ++ rotate_left2 \args ++.endm ++ ++.macro SVE_EOR3 rd:req,r1:req,r2:req ++ .if have_sve2 == 0 ++ sve_bitop eor,\rd,\r1 ++ sve_bitop eor,\rd,\r2 ++ .else 
++ eor3 \rd\().d,\rd\().d,\r1\().d,\r2\().d ++ .endif ++.endm ++ ++.macro FUNC_EOR3 ret:req,x:req,y:req,z:req ++ .if have_sve2 == 0 ++ eor \ret\().d,\x\().d,\y\().d ++ sve_bitop eor,\ret,\z ++ .else ++ movprfx \ret\().d,\x\().d ++ eor3 \ret\().d,\ret\().d,\y\().d,\z\().d ++ .endif ++.endm ++ ++.macro FUNC_FF windex:req,ret:req,x:req,y:req,z:req,tmp1:req,tmp2:req ++ and \ret\().d,\x\().d,\y\().d ++ and \tmp1\().d,\x\().d,\z\().d ++ and \tmp2\().d,\y\().d,\z\().d ++ sve_bitop orr,\ret,\tmp1 ++ sve_bitop orr,\ret,\tmp2 ++.endm ++ ++.macro FUNC_BSL ret:req,x:req,y:req,z:req,tmp:req ++ .if have_sve2 == 0 ++ bic \ret\().d,\z\().d,\x\().d ++ and \tmp\().d,\x\().d,\y\().d ++ sve_bitop orr,\ret,\tmp ++ .else ++ movprfx \ret\().d,\x\().d ++ bsl \ret\().d,\ret\().d,\y\().d,\z\().d ++ .endif ++.endm ++ ++.altmacro ++.macro load_next_words windex ++ .if \windex < 16 ++ load_words \windex ++ .endif ++.endm ++ ++.macro SM3_STEP_00_11 windex:req,w:req,w4:req ++ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7) ++ ld1rw {VT2.s},p0/z,[sm3const_adr,\windex * 4] ++ rotate_left SS1,VA,VT1,12 ++ mov SS2.s,p0/m,SS1.s ++ sve_op add,SS1.s,SS1.s,VE.s ++ sve_op add,SS1.s,SS1.s,VT2.s ++ rotate_left SS1,SS1,VT2,7 ++ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index] ++ add VT2.s,\w\().s,VH.s ++ FUNC_EOR3 TT2,VE,VF,VG ++ // SS2 = SS1 ^ rol32(a, 12) ++ sve_bitop eor,SS2,SS1 ++ sve_op add,TT2.s,TT2.s,VT2.s ++ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index] ++ FUNC_EOR3 VH,VA,VB,VC ++ eor VT1.d,\w\().d,\w4\().d ++ sve_op add,VH.s,VH.s,VD.s ++ sve_op add,VH.s,VH.s,VT1.s ++ add VD.s,TT2.s,SS1.s ++ sve_op add,VH.s,VH.s,SS2.s ++ // d = P0(TT2) ++ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17 ++ SVE_EOR3 VD,VT1,VT3 ++ // b = rol32(b, 9) ++ // f = rol32(f, 19) ++ rotate_left VB,VB,VT3,9,VF,VF,VT4,19 ++.endm ++ ++.macro SM3_STEP_12_15 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req ++ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7) ++ rotate_left 
VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12 ++ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] ++ mov TT2.s,p0/m,SS1.s ++ sve_bitop eor,VT,\w16 ++ sve_op add,SS1.s,SS1.s,VE.s ++ sve_bitop eor,VT,\w9 ++ sve_op add,SS1.s,SS1.s,VT1.s ++ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23 ++ SVE_EOR3 VT,VT1,VT3 ++ rotate_left SS1,SS1,VT2,7 ++ sve_bitop eor,\w4,VT ++ // SS2 = SS1 ^ rol32(a, 12) ++ eor SS2.d,TT2.d,SS1.d ++ sve_bitop eor,\w4,\w6 ++ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index] ++ FUNC_EOR3 TT2,VE,VF,VG ++ add VT1.s,\w\().s,VH.s ++ sve_op add,TT2.s,TT2.s,VT1.s ++ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index] ++ FUNC_EOR3 VH,VA,VB,VC ++ eor VT1.d,\w\().d,\w4\().d ++ sve_op add,VH.s,VH.s,VD.s ++ // b = rol32(b, 9) ++ // f = rol32(f, 19) ++ rotate_left VB,VB,VT3,9 ++ sve_op add,VH.s,VH.s,VT1.s ++ add VD.s,TT2.s,SS1.s ++ sve_op add,VH.s,VH.s,SS2.s ++ // d = P0(TT2) ++ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17,VF,VF,TT2,19 ++ SVE_EOR3 VD,VT1,VT3 ++.endm ++ ++.macro SM3_STEP_16_62 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req ++ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7) ++ rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12 ++ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] ++ mov TT2.s,p0/m,SS1.s ++ sve_bitop eor,VT,\w16 ++ sve_op add,SS1.s,SS1.s,VE.s ++ sve_bitop eor,VT,\w9 ++ sve_op add,SS1.s,SS1.s,VT1.s ++ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23 ++ SVE_EOR3 \w4,VT,VT1 ++ rotate_left SS1,SS1,VT2,7 ++ sve_bitop eor,\w4,VT3 ++ // SS2 = SS1 ^ rol32(a, 12) ++ eor SS2.d,TT2.d,SS1.d ++ sve_bitop eor,\w4,\w6 ++ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index] ++ sve_op add,SS1.s,SS1.s,\w\().s ++ FUNC_BSL TT2,VE,VF,VG,VT1 ++ sve_op add,SS1.s,SS1.s,VH.s ++ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index] ++ FUNC_FF \windex,VH,VA,VB,VC,VT1,VT2 ++ eor VT1.d,\w\().d,\w4\().d ++ sve_op add,VH.s,VH.s,VD.s ++ // b = rol32(b, 9) ++ // f = rol32(f, 19) ++ rotate_left VB,VB,VT2,9,VF,VF,VT4,19 ++ sve_op add,VH.s,VH.s,VT1.s ++ add 
VD.s,TT2.s,SS1.s ++ sve_op add,VH.s,VH.s,SS2.s ++ // d = P0(TT2) ++ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17 ++ SVE_EOR3 VD,VT1,VT3 ++.endm ++ ++.macro SM3_STEP_63 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req ++ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7) ++ rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12 ++ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] ++ mov TT2.s,p0/m,SS1.s ++ sve_bitop eor,VT,\w16 ++ sve_op add,SS1.s,SS1.s,VE.s ++ sve_bitop eor,VT,\w9 ++ sve_op add,SS1.s,SS1.s,VT1.s ++ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23 ++ SVE_EOR3 VT,VT1,VT3 ++ rotate_left SS1,SS1,VT2,7 ++ sve_bitop eor,\w4,VT ++ // SS2 = SS1 ^ rol32(a, 12) ++ eor SS2.d,TT2.d,SS1.d ++ sve_bitop eor,\w4,\w6 ++ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index] ++ FUNC_BSL TT2,VE,VF,VG,VT1 ++ add VT1.s,\w\().s,VH.s ++ .if \windex == 63 ++ ld1w {WORD0.s},p0/z,[abcd_buf, 0, MUL VL] ++ ld1w {WORD1.s},p0/z,[abcd_buf, 1, MUL VL] ++ ld1w {WORD2.s},p0/z,[abcd_buf, 2, MUL VL] ++ ld1w {WORD3.s},p0/z,[abcd_buf, 3, MUL VL] ++ ld1w {WORD4.s},p0/z,[abcd_buf, 4, MUL VL] ++ ld1w {WORD5.s},p0/z,[abcd_buf, 5, MUL VL] ++ ld1w {WORD6.s},p0/z,[abcd_buf, 6, MUL VL] ++ ld1w {WORD7.s},p0/z,[abcd_buf, 7, MUL VL] ++ .endif ++ sve_op add,TT2.s,TT2.s,VT1.s ++ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index] ++ FUNC_FF \windex,VH,VA,VB,VC,VT1,VT2 ++ eor VT1.d,\w\().d,\w4\().d ++ sve_op add,VH.s,VH.s,VD.s ++ // b = rol32(b, 9) ++ // f = rol32(f, 19) ++ rotate_left VB,VB,VT2,9,VF,VF,VT4,19 ++ sve_op add,VH.s,VH.s,VT1.s ++ add VD.s,TT2.s,SS1.s ++ sve_bitop eor,VA,WORD1 ++ sve_bitop eor,VB,WORD2 ++ sve_bitop eor,VC,WORD3 ++ // d = P0(TT2) ++ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17 ++ sve_bitop eor,VF,WORD6 ++ SVE_EOR3 VD,VT1,VT3 ++ sve_bitop eor,VG,WORD7 ++ sve_bitop eor,VD,WORD4 ++ sve_op add,VH.s,VH.s,SS2.s ++ sve_bitop eor,VE,WORD5 ++ sve_bitop eor,VH,WORD0 ++.endm ++ ++.macro SWAP_STATES ++ .unreq TT ++ TT .req VH ++ .unreq VH ++ VH .req VG ++ .unreq VG ++ VG .req VF ++ .unreq 
VF ++ VF .req VE ++ .unreq VE ++ VE .req VD ++ .unreq VD ++ VD .req VC ++ .unreq VC ++ VC .req VB ++ .unreq VB ++ VB .req VA ++ .unreq VA ++ VA .req TT ++.endm ++ ++.altmacro ++.macro SM3_STEP_WRAPPER windex:req,idx:req,idx4:req,idx16,idx13,idx9,idx6,idx3 ++ .if \windex <= 11 ++ revb WORD\idx4\().s, p0/m, WORD\idx4\().s ++ next=\idx4+1 ++ load_next_words %next ++ SM3_STEP_00_11 \windex,WORD\idx\(),WORD\idx4\() ++ .else ++ .if \windex < 16 ++ SM3_STEP_12_15 \windex,WORD\idx\(),\ ++ WORD\idx4\(),WORD\idx16\(),WORD\idx13\(),\ ++ WORD\idx9\(),WORD\idx6\(),WORD\idx3\() ++ .else ++ .if \windex == 63 ++ SM3_STEP_63 \windex,WORD\idx\(),WORD\idx4\(),\ ++ WORD\idx16\(),WORD\idx13\(),WORD\idx9\(),\ ++ WORD\idx6\(),WORD\idx3\() ++ .else ++ SM3_STEP_16_62 \windex,WORD\idx\(),WORD\idx4\(),\ ++ WORD\idx16\(),WORD\idx13\(),WORD\idx9\(),\ ++ WORD\idx6\(),WORD\idx3\() ++ .endif ++ .endif ++ .endif ++.endm ++ ++.macro exec_step windex:req ++ .if \windex <= 11 ++ idx4=\windex+4 ++ SM3_STEP_WRAPPER \windex,\windex,%idx4 ++ .else ++ idxp4=\windex + 4 ++ idx4=idxp4 % 17 ++ idx16=(idxp4 - 16) % 17 ++ idx13=(idxp4 - 13) % 17 ++ idx9=(idxp4 - 9) % 17 ++ idx6=(idxp4 - 6) % 17 ++ idx3=(idxp4 - 3) % 17 ++ idx=\windex % 17 ++ SM3_STEP_WRAPPER \windex,%idx,%idx4,%idx16,%idx13,%idx9,%idx6,%idx3 ++ .endif ++ SWAP_STATES ++.endm ++ ++.macro sm3_exec ++ current_step=0 ++ .rept 64 ++ exec_step %current_step ++ current_step=current_step+1 ++ .endr ++.endm ++ ++.macro sm3_single sve2:vararg ++ .ifnb \sve2 ++ have_sve2 = 1 ++ .else ++ have_sve2=0 ++ .endif ++ st1w {VA.s},p0,[abcd_buf, 0, MUL VL] ++ st1w {VB.s},p0,[abcd_buf, 1, MUL VL] ++ st1w {VC.s},p0,[abcd_buf, 2, MUL VL] ++ st1w {VD.s},p0,[abcd_buf, 3, MUL VL] ++ st1w {VE.s},p0,[abcd_buf, 4, MUL VL] ++ st1w {VF.s},p0,[abcd_buf, 5, MUL VL] ++ st1w {VG.s},p0,[abcd_buf, 6, MUL VL] ++ st1w {VH.s},p0,[abcd_buf, 7, MUL VL] ++ load_words 0 ++ load_words 1 ++ load_words 2 ++ load_words 3 ++ load_words 4 ++ revb WORD0.s, p0/m, WORD0.s ++ revb WORD1.s, p0/m, 
WORD1.s ++ revb WORD2.s, p0/m, WORD2.s ++ revb WORD3.s, p0/m, WORD3.s ++ .if have_sve2 == 1 ++ mov VZERO.s,p0/m,#0 ++ .endif ++ sm3_exec ++.endm ++ ++.macro sm3_sve_save_stack ++ stp d8,d9,[sp, -64]! ++ stp d10,d11,[sp, 16] ++ stp d12,d13,[sp, 32] ++ stp d14,d15,[sp, 48] ++.endm ++ ++.macro sm3_sve_restore_stack ++ ldp d10,d11,[sp, 16] ++ ldp d12,d13,[sp, 32] ++ ldp d14,d15,[sp, 48] ++ ldp d8,d9,[sp],64 ++.endm ++ ++ .section .rodata.cst16,"aM",@progbits,16 ++ .align 16 ++SM3_CONSTS: ++ .word 0x79CC4519 ++ .word 0xF3988A32 ++ .word 0xE7311465 ++ .word 0xCE6228CB ++ .word 0x9CC45197 ++ .word 0x3988A32F ++ .word 0x7311465E ++ .word 0xE6228CBC ++ .word 0xCC451979 ++ .word 0x988A32F3 ++ .word 0x311465E7 ++ .word 0x6228CBCE ++ .word 0xC451979C ++ .word 0x88A32F39 ++ .word 0x11465E73 ++ .word 0x228CBCE6 ++ .word 0x9D8A7A87 ++ .word 0x3B14F50F ++ .word 0x7629EA1E ++ .word 0xEC53D43C ++ .word 0xD8A7A879 ++ .word 0xB14F50F3 ++ .word 0x629EA1E7 ++ .word 0xC53D43CE ++ .word 0x8A7A879D ++ .word 0x14F50F3B ++ .word 0x29EA1E76 ++ .word 0x53D43CEC ++ .word 0xA7A879D8 ++ .word 0x4F50F3B1 ++ .word 0x9EA1E762 ++ .word 0x3D43CEC5 ++ .word 0x7A879D8A ++ .word 0xF50F3B14 ++ .word 0xEA1E7629 ++ .word 0xD43CEC53 ++ .word 0xA879D8A7 ++ .word 0x50F3B14F ++ .word 0xA1E7629E ++ .word 0x43CEC53D ++ .word 0x879D8A7A ++ .word 0x0F3B14F5 ++ .word 0x1E7629EA ++ .word 0x3CEC53D4 ++ .word 0x79D8A7A8 ++ .word 0xF3B14F50 ++ .word 0xE7629EA1 ++ .word 0xCEC53D43 ++ .word 0x9D8A7A87 ++ .word 0x3B14F50F ++ .word 0x7629EA1E ++ .word 0xEC53D43C ++ .word 0xD8A7A879 ++ .word 0xB14F50F3 ++ .word 0x629EA1E7 ++ .word 0xC53D43CE ++ .word 0x8A7A879D ++ .word 0x14F50F3B ++ .word 0x29EA1E76 ++ .word 0x53D43CEC ++ .word 0xA7A879D8 ++ .word 0x4F50F3B1 ++ .word 0x9EA1E762 ++ .word 0x3D43CEC5 ++ +-- +2.25.1 + diff --git a/0033-uadk_tool-fix-aead-performance-test-issue.patch b/0033-uadk_tool-fix-aead-performance-test-issue.patch new file mode 100644 index 0000000..c422378 --- /dev/null +++ 
b/0033-uadk_tool-fix-aead-performance-test-issue.patch @@ -0,0 +1,38 @@ +From f5787232f4f5cb09445bfc87d20cb2b43f5e5ea3 Mon Sep 17 00:00:00 2001 +From: Longfang Liu +Date: Mon, 11 Mar 2024 16:14:34 +0800 +Subject: [PATCH 33/44] uadk_tool: fix aead performance test issue + +In the current UADK code, due to the new support for aead stream mode, +a new msg_state state has been added. If the initial value is not +assigned, an error will occur in the block mode check. +As a result, the performance test cannot be executed. + +Signed-off-by: Longfang Liu +--- + uadk_tool/benchmark/sec_uadk_benchmark.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/uadk_tool/benchmark/sec_uadk_benchmark.c b/uadk_tool/benchmark/sec_uadk_benchmark.c +index c99ae89..92e967a 100644 +--- a/uadk_tool/benchmark/sec_uadk_benchmark.c ++++ b/uadk_tool/benchmark/sec_uadk_benchmark.c +@@ -1165,6 +1165,7 @@ static void *sec_uadk_aead_async(void *arg) + areq.mac_bytes = auth_size; + areq.assoc_bytes = SEC_AEAD_LEN; + areq.in_bytes = g_pktlen; ++ areq.msg_state = 0; + if (pdata->is_union) + areq.mac_bytes = 32; + if (areq.op_type) // decrypto +@@ -1396,6 +1397,7 @@ static void *sec_uadk_aead_sync(void *arg) + areq.assoc_bytes = SEC_AEAD_LEN; + areq.in_bytes = g_pktlen; + areq.mac_bytes = g_maclen; ++ areq.msg_state = 0; + if (areq.op_type) // decrypto + areq.out_bytes = g_pktlen + 16; // aadsize = 16; + else +-- +2.25.1 + diff --git a/0034-uadk_tool-fix-the-logic-for-counting-retransmissions.patch b/0034-uadk_tool-fix-the-logic-for-counting-retransmissions.patch new file mode 100644 index 0000000..98c833d --- /dev/null +++ b/0034-uadk_tool-fix-the-logic-for-counting-retransmissions.patch @@ -0,0 +1,100 @@ +From 12466753e870b9577826d238e08a744c17267a8f Mon Sep 17 00:00:00 2001 +From: Chenghai Huang +Date: Mon, 11 Mar 2024 16:19:46 +0800 +Subject: [PATCH 34/44] uadk_tool: fix the logic for counting retransmissions + when busy + +The try cnt should be set to 0 after the packet is successfully 
+sent, not after BUSY is returned. + +Signed-off-by: Chenghai Huang +--- + uadk_tool/benchmark/zip_uadk_benchmark.c | 8 ++++---- + uadk_tool/benchmark/zip_wd_benchmark.c | 7 +++---- + 2 files changed, 7 insertions(+), 8 deletions(-) + +diff --git a/uadk_tool/benchmark/zip_uadk_benchmark.c b/uadk_tool/benchmark/zip_uadk_benchmark.c +index 435c0b4..9681c22 100644 +--- a/uadk_tool/benchmark/zip_uadk_benchmark.c ++++ b/uadk_tool/benchmark/zip_uadk_benchmark.c +@@ -817,9 +817,8 @@ static void *zip_uadk_blk_lz77_async_run(void *arg) + + while(1) { + if (get_run_state() == 0) +- break; ++ break; + +- try_cnt = 0; + i = count % MAX_POOL_LENTH_COMP; + creq.src = uadk_pool->bds[i].src; + creq.dst = &hw_buff_out[i]; //temp out +@@ -845,6 +844,7 @@ static void *zip_uadk_blk_lz77_async_run(void *arg) + } else if (ret || creq.status) { + break; + } ++ try_cnt = 0; + count++; + } + +@@ -1037,9 +1037,8 @@ static void *zip_uadk_blk_async_run(void *arg) + + while(1) { + if (get_run_state() == 0) +- break; ++ break; + +- try_cnt = 0; + i = count % MAX_POOL_LENTH_COMP; + creq.src = uadk_pool->bds[i].src; + creq.dst = uadk_pool->bds[i].dst; +@@ -1062,6 +1061,7 @@ static void *zip_uadk_blk_async_run(void *arg) + } else if (ret || creq.status) { + break; + } ++ try_cnt = 0; + count++; + } + +diff --git a/uadk_tool/benchmark/zip_wd_benchmark.c b/uadk_tool/benchmark/zip_wd_benchmark.c +index d2340e0..d7bafd6 100644 +--- a/uadk_tool/benchmark/zip_wd_benchmark.c ++++ b/uadk_tool/benchmark/zip_wd_benchmark.c +@@ -708,9 +708,8 @@ static void *zip_wd_blk_lz77_async_run(void *arg) + + while(1) { + if (get_run_state() == 0) +- break; ++ break; + +- try_cnt = 0; + i = count % MAX_POOL_LENTH_COMP; + opdata.in = bd_pool[i].src; + opdata.out = bd_pool[i].dst; //temp out +@@ -737,6 +736,7 @@ static void *zip_wd_blk_lz77_async_run(void *arg) + opdata.status == WD_IN_EPARA || opdata.status == WD_VERIFY_ERR) { + break; + } ++ try_cnt = 0; + count++; + } + +@@ -984,7 +984,6 @@ static void 
*zip_wd_blk_async_run(void *arg) + opdata.in_len = bd_pool[i].src_len; + opdata.avail_out = out_len; + +- try_cnt = 0; + tag[i].ctx = ctx; + tag[i].td_id = pdata->td_id; + tag[i].bd_idx = i; +@@ -1002,7 +1001,7 @@ static void *zip_wd_blk_async_run(void *arg) + opdata.status == WD_IN_EPARA || opdata.status == WD_VERIFY_ERR) { + break; + } +- ++ try_cnt = 0; + count++; + } + +-- +2.25.1 + diff --git a/0035-uadk-tools-support-the-nosva-test-of-a-specified-dev.patch b/0035-uadk-tools-support-the-nosva-test-of-a-specified-dev.patch new file mode 100644 index 0000000..a816149 --- /dev/null +++ b/0035-uadk-tools-support-the-nosva-test-of-a-specified-dev.patch @@ -0,0 +1,119 @@ +From 17e5f25df480a5cacc3ac5e8ae88b708786eec44 Mon Sep 17 00:00:00 2001 +From: Chenghai Huang +Date: Mon, 11 Mar 2024 16:22:58 +0800 +Subject: [PATCH 35/44] uadk/tools - support the nosva test of a specified + device + +Add the description of device parameters. The input should +use the device name from '/sys/class/uacce/'. Only full +matching device names are supported. 
+ +Signed-off-by: Chenghai Huang +--- + uadk_tool/benchmark/hpre_wd_benchmark.c | 8 ++++++++ + uadk_tool/benchmark/sec_wd_benchmark.c | 8 ++++++++ + uadk_tool/benchmark/trng_wd_benchmark.c | 8 ++++++++ + uadk_tool/benchmark/uadk_benchmark.c | 3 +++ + uadk_tool/benchmark/zip_wd_benchmark.c | 8 ++++++++ + 5 files changed, 35 insertions(+) + +diff --git a/uadk_tool/benchmark/hpre_wd_benchmark.c b/uadk_tool/benchmark/hpre_wd_benchmark.c +index 2873ffd..6dc1269 100644 +--- a/uadk_tool/benchmark/hpre_wd_benchmark.c ++++ b/uadk_tool/benchmark/hpre_wd_benchmark.c +@@ -431,6 +431,14 @@ static int init_hpre_wd_queue(struct acc_option *options) + /* nodemask need to be clean */ + g_thread_queue.bd_res[i].queue->node_mask = 0x0; + memset(g_thread_queue.bd_res[i].queue->dev_path, 0x0, PATH_STR_SIZE); ++ if (strlen(options->device) != 0) { ++ ret = snprintf(g_thread_queue.bd_res[i].queue->dev_path, ++ PATH_STR_SIZE, "%s", options->device); ++ if (ret < 0) { ++ WD_ERR("failed to copy dev file path!\n"); ++ return -WD_EINVAL; ++ } ++ } + + ret = wd_request_queue(g_thread_queue.bd_res[i].queue); + if (ret) { +diff --git a/uadk_tool/benchmark/sec_wd_benchmark.c b/uadk_tool/benchmark/sec_wd_benchmark.c +index aa03db8..2ed8493 100644 +--- a/uadk_tool/benchmark/sec_wd_benchmark.c ++++ b/uadk_tool/benchmark/sec_wd_benchmark.c +@@ -600,6 +600,14 @@ static int init_wd_queue(struct acc_option *options) + /* nodemask need to be clean */ + g_thread_queue.bd_res[i].queue->node_mask = 0x0; + memset(g_thread_queue.bd_res[i].queue->dev_path, 0x0, PATH_STR_SIZE); ++ if (strlen(options->device) != 0) { ++ ret = snprintf(g_thread_queue.bd_res[i].queue->dev_path, ++ PATH_STR_SIZE, "%s", options->device); ++ if (ret < 0) { ++ WD_ERR("failed to copy dev file path!\n"); ++ return -WD_EINVAL; ++ } ++ } + + ret = wd_request_queue(g_thread_queue.bd_res[i].queue); + if (ret) { +diff --git a/uadk_tool/benchmark/trng_wd_benchmark.c b/uadk_tool/benchmark/trng_wd_benchmark.c +index 64942f0..3ce329a 100644 +--- 
a/uadk_tool/benchmark/trng_wd_benchmark.c ++++ b/uadk_tool/benchmark/trng_wd_benchmark.c +@@ -51,6 +51,14 @@ static int init_trng_wd_queue(struct acc_option *options) + /* nodemask need to be clean */ + g_thread_queue.bd_res[i].queue->node_mask = 0x0; + memset(g_thread_queue.bd_res[i].queue->dev_path, 0x0, PATH_STR_SIZE); ++ if (strlen(options->device) != 0) { ++ ret = snprintf(g_thread_queue.bd_res[i].queue->dev_path, ++ PATH_STR_SIZE, "%s", options->device); ++ if (ret < 0) { ++ WD_ERR("failed to copy dev file path!\n"); ++ return -WD_EINVAL; ++ } ++ } + + g_thread_queue.bd_res[i].in_bytes = options->pktlen; + g_thread_queue.bd_res[i].out = malloc(options->pktlen); +diff --git a/uadk_tool/benchmark/uadk_benchmark.c b/uadk_tool/benchmark/uadk_benchmark.c +index cf3a93c..0ebbb68 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.c ++++ b/uadk_tool/benchmark/uadk_benchmark.c +@@ -595,6 +595,7 @@ static void dump_param(struct acc_option *option) + ACC_TST_PRT(" [--engine]: %s\n", option->engine); + ACC_TST_PRT(" [--latency]: %u\n", option->latency); + ACC_TST_PRT(" [--init2]: %u\n", option->inittype); ++ ACC_TST_PRT(" [--device]: %s\n", option->device); + } + + int acc_benchmark_run(struct acc_option *option) +@@ -718,6 +719,8 @@ static void print_help(void) + ACC_TST_PRT(" test the running time of packets\n"); + ACC_TST_PRT(" [--init2]:\n"); + ACC_TST_PRT(" select init2 mode in the init interface of UADK SVA\n"); ++ ACC_TST_PRT(" [--device]:\n"); ++ ACC_TST_PRT(" select device to do task\n"); + ACC_TST_PRT(" [--help] = usage\n"); + ACC_TST_PRT("Example\n"); + ACC_TST_PRT(" ./uadk_tool benchmark --alg aes-128-cbc --mode sva --opt 0 --sync\n"); +diff --git a/uadk_tool/benchmark/zip_wd_benchmark.c b/uadk_tool/benchmark/zip_wd_benchmark.c +index d7bafd6..4424e08 100644 +--- a/uadk_tool/benchmark/zip_wd_benchmark.c ++++ b/uadk_tool/benchmark/zip_wd_benchmark.c +@@ -310,6 +310,14 @@ static int init_zip_wd_queue(struct acc_option *options) + /* nodemask need to be clean */ + 
g_thread_queue.bd_res[i].queue->node_mask = 0x0; + memset(g_thread_queue.bd_res[i].queue->dev_path, 0x0, PATH_STR_SIZE); ++ if (strlen(options->device) != 0) { ++ ret = snprintf(g_thread_queue.bd_res[i].queue->dev_path, ++ PATH_STR_SIZE, "%s", options->device); ++ if (ret < 0) { ++ WD_ERR("failed to copy dev file path!\n"); ++ return -WD_EINVAL; ++ } ++ } + + ret = wd_request_queue(g_thread_queue.bd_res[i].queue); + if (ret) { +-- +2.25.1 + diff --git a/0036-uadk-tools-support-designated-device-testing.patch b/0036-uadk-tools-support-designated-device-testing.patch new file mode 100644 index 0000000..7334716 --- /dev/null +++ b/0036-uadk-tools-support-designated-device-testing.patch @@ -0,0 +1,674 @@ +From 829bc553310349ee7c654397204e8b348d7610f4 Mon Sep 17 00:00:00 2001 +From: Yang Shen +Date: Mon, 11 Mar 2024 16:27:04 +0800 +Subject: [PATCH 36/44] uadk/tools - support designated device testing + +Add a parameter 'device' to designate a device. The input should +use the device name from '/sys/class/uacce/'. Only full +matching device names are supported. 
+ +Signed-off-by: Yang Shen +--- + uadk_tool/benchmark/hpre_uadk_benchmark.c | 86 ++++++++++++++------- + uadk_tool/benchmark/sec_uadk_benchmark.c | 86 ++++++++++++++------- + uadk_tool/benchmark/uadk_benchmark.c | 64 ++++++++------- + uadk_tool/benchmark/uadk_benchmark.h | 36 +++++---- + uadk_tool/benchmark/zip_uadk_benchmark.c | 94 ++++++++++++++++------- + 5 files changed, 237 insertions(+), 129 deletions(-) + +diff --git a/uadk_tool/benchmark/hpre_uadk_benchmark.c b/uadk_tool/benchmark/hpre_uadk_benchmark.c +index 0cbbdf2..729728f 100644 +--- a/uadk_tool/benchmark/hpre_uadk_benchmark.c ++++ b/uadk_tool/benchmark/hpre_uadk_benchmark.c +@@ -346,43 +346,66 @@ static int hpre_uadk_param_parse(thread_data *tddata, struct acc_option *options + + static int init_hpre_ctx_config(struct acc_option *options) + { ++ struct uacce_dev_list *list, *tmp; + int subtype = options->subtype; + char *alg = options->algclass; + int mode = options->syncmode; ++ struct uacce_dev *dev = NULL; + struct sched_params param; +- struct uacce_dev *dev; +- int max_node; ++ int max_node, i; ++ char *dev_name; + int ret = 0; +- int i = 0; + + max_node = numa_max_node() + 1; + if (max_node <= 0) + return -EINVAL; + +- memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); +- g_ctx_cfg.ctx_num = g_ctxnum; +- g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); +- if (!g_ctx_cfg.ctxs) +- return -ENOMEM; ++ list = wd_get_accel_list(alg); ++ if (!list) { ++ HPRE_TST_PRT("failed to get %s device\n", alg); ++ return -ENODEV; ++ } + +- while (i < g_ctxnum) { +- dev = wd_get_accel_dev(alg); +- if (!dev) { +- HPRE_TST_PRT("failed to get %s device\n", alg); +- ret = -EINVAL; +- goto out; ++ if (strlen(options->device) == 0) { ++ dev = list->dev; ++ } else { ++ for (tmp = list; tmp; tmp = tmp->next) { ++ dev_name = strrchr(tmp->dev->dev_root, '/') + 1; ++ if (!strcmp(dev_name, options->device)) { ++ dev = tmp->dev; ++ break; ++ } + } ++ } + +- for (; i < g_ctxnum; i++) { +- g_ctx_cfg.ctxs[i].ctx = 
wd_request_ctx(dev); +- if (!g_ctx_cfg.ctxs[i].ctx) +- break; ++ if (dev == NULL) { ++ HPRE_TST_PRT("failed to find device %s\n", options->device); ++ ret = -ENODEV; ++ goto free_list; ++ } ++ ++ /* If there is no numa, we defualt config to zero */ ++ if (dev->numa_id < 0) ++ dev->numa_id = 0; + +- g_ctx_cfg.ctxs[i].op_type = 0; // default op_type +- g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; ++ memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); ++ g_ctx_cfg.ctx_num = g_ctxnum; ++ g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); ++ if (!g_ctx_cfg.ctxs) { ++ ret = -ENOMEM; ++ goto free_list; ++ } ++ ++ for (i = 0; i < g_ctxnum; i++) { ++ g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); ++ if (!g_ctx_cfg.ctxs[i].ctx) { ++ HPRE_TST_PRT("failed to alloc %dth ctx\n", i); ++ ret = -ENODEV; ++ goto free_ctx; + } + +- free(dev); ++ g_ctx_cfg.ctxs[i].op_type = 0; ++ g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; + } + + switch(subtype) { +@@ -401,11 +424,11 @@ static int init_hpre_ctx_config(struct acc_option *options) + break; + default: + HPRE_TST_PRT("failed to parse alg subtype!\n"); +- return -EINVAL; ++ goto free_ctx; + } + if (!g_sched) { + HPRE_TST_PRT("failed to alloc sched!\n"); +- goto out; ++ goto free_ctx; + } + + g_sched->name = SCHED_SINGLE; +@@ -417,7 +440,7 @@ static int init_hpre_ctx_config(struct acc_option *options) + ret = wd_sched_rr_instance(g_sched, ¶m); + if (ret) { + HPRE_TST_PRT("failed to fill hpre sched data!\n"); +- goto out; ++ goto free_sched; + } + + /* init */ +@@ -438,17 +461,22 @@ static int init_hpre_ctx_config(struct acc_option *options) + } + if (ret) { + HPRE_TST_PRT("failed to get hpre ctx!\n"); +- goto out; ++ goto free_sched; + } + + return 0; +-out: +- for (i = i - 1; i >= 0; i--) +- wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); + +- free(g_ctx_cfg.ctxs); ++free_sched: + wd_sched_rr_release(g_sched); + ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++ free(g_ctx_cfg.ctxs); ++ ++free_list: ++ 
wd_free_list_accels(list); ++ + return ret; + } + +diff --git a/uadk_tool/benchmark/sec_uadk_benchmark.c b/uadk_tool/benchmark/sec_uadk_benchmark.c +index 92e967a..105fb1a 100644 +--- a/uadk_tool/benchmark/sec_uadk_benchmark.c ++++ b/uadk_tool/benchmark/sec_uadk_benchmark.c +@@ -516,42 +516,66 @@ static int sec_uadk_param_parse(thread_data *tddata, struct acc_option *options) + + static int init_ctx_config(struct acc_option *options) + { ++ struct uacce_dev_list *list, *tmp; + struct sched_params param = {0}; +- struct uacce_dev *dev = NULL; +- char *alg = options->algclass; + int subtype = options->subtype; ++ char *alg = options->algclass; + int mode = options->syncmode; +- int max_node = 0; ++ struct uacce_dev *dev = NULL; ++ int max_node, i; ++ char *dev_name; + int ret = 0; +- int i = 0; + + max_node = numa_max_node() + 1; + if (max_node <= 0) + return -EINVAL; + +- memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); +- g_ctx_cfg.ctx_num = g_ctxnum; +- g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); +- if (!g_ctx_cfg.ctxs) +- return -ENOMEM; ++ list = wd_get_accel_list(alg); ++ if (!list) { ++ SEC_TST_PRT("failed to get %s device\n", alg); ++ return -ENODEV; ++ } + +- while (i < g_ctxnum) { +- dev = wd_get_accel_dev(alg); +- if (!dev) { +- SEC_TST_PRT("failed to get %s device\n", alg); +- goto out; ++ if (strlen(options->device) == 0) { ++ dev = list->dev; ++ } else { ++ for (tmp = list; tmp; tmp = tmp->next) { ++ dev_name = strrchr(tmp->dev->dev_root, '/') + 1; ++ if (!strcmp(dev_name, options->device)) { ++ dev = tmp->dev; ++ break; ++ } + } ++ } + +- for (; i < g_ctxnum; i++) { +- g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); +- if (!g_ctx_cfg.ctxs[i].ctx) +- break; ++ if (dev == NULL) { ++ SEC_TST_PRT("failed to find device %s\n", options->device); ++ ret = -ENODEV; ++ goto free_list; ++ } ++ ++ /* If there is no numa, we defualt config to zero */ ++ if (dev->numa_id < 0) ++ dev->numa_id = 0; ++ ++ memset(&g_ctx_cfg, 0, sizeof(struct 
wd_ctx_config)); ++ g_ctx_cfg.ctx_num = g_ctxnum; ++ g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); ++ if (!g_ctx_cfg.ctxs) { ++ ret = -ENOMEM; ++ goto free_list; ++ } + +- g_ctx_cfg.ctxs[i].op_type = 0; // default op_type +- g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; ++ for (i = 0; i < g_ctxnum; i++) { ++ g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); ++ if (!g_ctx_cfg.ctxs[i].ctx) { ++ SEC_TST_PRT("failed to alloc %dth ctx\n", i); ++ ret = -ENOMEM; ++ goto free_ctx; + } + +- free(dev); ++ g_ctx_cfg.ctxs[i].op_type = 0; ++ g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; + } + + switch(subtype) { +@@ -566,11 +590,11 @@ static int init_ctx_config(struct acc_option *options) + break; + default: + SEC_TST_PRT("failed to parse alg subtype!\n"); +- return -EINVAL; ++ goto free_ctx; + } + if (!g_sched) { + SEC_TST_PRT("failed to alloc sched!\n"); +- goto out; ++ goto free_ctx; + } + + g_sched->name = SCHED_SINGLE; +@@ -582,7 +606,7 @@ static int init_ctx_config(struct acc_option *options) + ret = wd_sched_rr_instance(g_sched, ¶m); + if (ret) { + SEC_TST_PRT("failed to fill sched data!\n"); +- goto out; ++ goto free_sched; + } + + /* init */ +@@ -599,17 +623,21 @@ static int init_ctx_config(struct acc_option *options) + } + if (ret) { + SEC_TST_PRT("failed to cipher ctx!\n"); +- goto out; ++ goto free_sched; + } + + return 0; + +-out: +- for (i--; i >= 0; i--) +- wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++free_sched: ++ wd_sched_rr_release(g_sched); + ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); + free(g_ctx_cfg.ctxs); +- wd_sched_rr_release(g_sched); ++ ++free_list: ++ wd_free_list_accels(list); + + return ret; + } +diff --git a/uadk_tool/benchmark/uadk_benchmark.c b/uadk_tool/benchmark/uadk_benchmark.c +index 0ebbb68..5dbe26a 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.c ++++ b/uadk_tool/benchmark/uadk_benchmark.c +@@ -491,6 +491,7 @@ static void parse_alg_param(struct acc_option *option) + void cal_perfermance_data(struct acc_option 
*option, u32 sttime) + { + u8 palgname[MAX_ALG_NAME]; ++ char *unit = "KiB/s"; + double perfermance; + double cpu_rate; + u32 ttime = 1000; +@@ -506,8 +507,8 @@ void cal_perfermance_data(struct acc_option *option, u32 sttime) + if (option->syncmode == SYNC_MODE) { + if (get_recv_time() == option->threads) + break; +- } else { // ASYNC_MODE +- if (get_recv_time() == 1) // poll complete ++ } else { ++ if (get_recv_time() == 1) + break; + } + usleep(1000); +@@ -525,14 +526,17 @@ void cal_perfermance_data(struct acc_option *option, u32 sttime) + palgname[i] = '\0'; + + ptime = ptime - sttime; ++ cpu_rate = (double)ptime / option->times; ++ + perfdata = g_recv_data.pkg_len * g_recv_data.recv_cnt / 1024.0; +- perfops = (double)(g_recv_data.recv_cnt) / 1000.0; + perfermance = perfdata / option->times; ++ ++ perfops = g_recv_data.recv_cnt / 1000.0; + ops = perfops / option->times; +- cpu_rate = (double)ptime / option->times; +- ACC_TST_PRT("algname: length: perf: iops: CPU_rate:\n" +- "%s %-2uBytes %.1fKB/s %.1fKops %.2f%%\n", +- palgname, option->pktlen, perfermance, ops, cpu_rate); ++ ++ ACC_TST_PRT("algname:\tlength:\t\tperf:\t\tiops:\t\tCPU_rate:\n" ++ "%s\t%-2uBytes \t%.2f%s\t%.1fKops \t%.2f%%\n", ++ palgname, option->pktlen, perfermance, unit, ops, cpu_rate); + } + + static int benchmark_run(struct acc_option *option) +@@ -744,24 +748,25 @@ int acc_cmd_parse(int argc, char *argv[], struct acc_option *option) + int c; + + static struct option long_options[] = { +- {"help", no_argument, 0, 0}, +- {"alg", required_argument, 0, 1}, +- {"mode", required_argument, 0, 2}, +- {"opt", required_argument, 0, 3}, +- {"sync", no_argument, 0, 4}, +- {"async", no_argument, 0, 5}, +- {"pktlen", required_argument, 0, 6}, +- {"seconds", required_argument, 0, 7}, +- {"thread", required_argument, 0, 8}, +- {"multi", required_argument, 0, 9}, +- {"ctxnum", required_argument, 0, 10}, +- {"prefetch", no_argument, 0, 11}, +- {"engine", required_argument, 0, 12}, +- {"alglist", no_argument, 
0, 13}, +- {"latency", no_argument, 0, 14}, +- {"winsize", required_argument, 0, 15}, +- {"complevel", required_argument, 0, 16}, +- {"init2", no_argument, 0, 17}, ++ {"help", no_argument, 0, 0}, ++ {"alg", required_argument, 0, 1}, ++ {"mode", required_argument, 0, 2}, ++ {"opt", required_argument, 0, 3}, ++ {"sync", no_argument, 0, 4}, ++ {"async", no_argument, 0, 5}, ++ {"pktlen", required_argument, 0, 6}, ++ {"seconds", required_argument, 0, 7}, ++ {"thread", required_argument, 0, 8}, ++ {"multi", required_argument, 0, 9}, ++ {"ctxnum", required_argument, 0, 10}, ++ {"prefetch", no_argument, 0, 11}, ++ {"engine", required_argument, 0, 12}, ++ {"alglist", no_argument, 0, 13}, ++ {"latency", no_argument, 0, 14}, ++ {"winsize", required_argument, 0, 15}, ++ {"complevel", required_argument, 0, 16}, ++ {"init2", no_argument, 0, 17}, ++ {"device", required_argument, 0, 18}, + {0, 0, 0, 0} + }; + +@@ -826,8 +831,15 @@ int acc_cmd_parse(int argc, char *argv[], struct acc_option *option) + case 17: + option->inittype = INIT2_TYPE; + break; ++ case 18: ++ if (strlen(optarg) >= MAX_DEVICE_NAME) { ++ ACC_TST_PRT("invalid: device name is %s\n", optarg); ++ goto to_exit; ++ } ++ strcpy(option->device, optarg); ++ break; + default: +- ACC_TST_PRT("bad input test parameter!\n"); ++ ACC_TST_PRT("invalid: bad input parameter!\n"); + print_help(); + goto to_exit; + } +diff --git a/uadk_tool/benchmark/uadk_benchmark.h b/uadk_tool/benchmark/uadk_benchmark.h +index 1752948..fd3ebe5 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.h ++++ b/uadk_tool/benchmark/uadk_benchmark.h +@@ -6,27 +6,28 @@ + #include + #include + #include ++#include + #include +-#include + #include ++#include + #include + #include +-#include + #include +-#include + #include + #include ++#include + +-#define ACC_TST_PRT printf +-#define PROCESS_NUM 32 +-#define THREADS_NUM 64 +-#define MAX_CTX_NUM 64 ++#define ACC_TST_PRT printf ++#define PROCESS_NUM 32 ++#define THREADS_NUM 64 ++#define MAX_CTX_NUM 64 + #define 
MAX_TIME_SECONDS 128 +-#define BYTES_TO_MB 20 +-#define MAX_OPT_TYPE 6 +-#define MAX_DATA_SIZE (15 * 1024 * 1024) +-#define MAX_ALG_NAME 64 +-#define ACC_QUEUE_SIZE 1024 ++#define BYTES_TO_MB 20 ++#define MAX_OPT_TYPE 6 ++#define MAX_DATA_SIZE (15 * 1024 * 1024) ++#define MAX_ALG_NAME 64 ++#define ACC_QUEUE_SIZE 1024 ++#define MAX_DEVICE_NAME 64 + + #define MAX_BLOCK_NM 16384 /* BLOCK_NUM must 4 times of POOL_LENTH */ + #define MAX_POOL_LENTH 4096 +@@ -35,15 +36,15 @@ + #define SEC_2_USEC 1000000 + #define HASH_ZISE 16 + ++#define SCHED_SINGLE "sched_single" ++#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) ++#define gettid() syscall(__NR_gettid) ++ + typedef unsigned long long u64; + typedef unsigned int u32; + typedef unsigned short u16; + typedef unsigned char u8; + +-#define SCHED_SINGLE "sched_single" +-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +-#define gettid() syscall(__NR_gettid) +- + /** + * struct acc_option - Define the test acc app option list. + * @algclass: 0:cipher 1:digest +@@ -55,9 +56,10 @@ typedef unsigned char u8; + * @latency: test packet running time + */ + struct acc_option { +- char algname[64]; ++ char algname[MAX_ALG_NAME]; + char algclass[64]; + char engine[64]; ++ char device[MAX_DEVICE_NAME]; + u32 algtype; + u32 modetype; + u32 optype; +diff --git a/uadk_tool/benchmark/zip_uadk_benchmark.c b/uadk_tool/benchmark/zip_uadk_benchmark.c +index 9681c22..63fbdab 100644 +--- a/uadk_tool/benchmark/zip_uadk_benchmark.c ++++ b/uadk_tool/benchmark/zip_uadk_benchmark.c +@@ -16,6 +16,7 @@ + #define MAX_POOL_LENTH_COMP 1 + #define COMPRESSION_RATIO_FACTOR 0.7 + #define CHUNK_SIZE (128 * 1024) ++ + struct uadk_bd { + u8 *src; + u8 *dst; +@@ -61,6 +62,7 @@ struct zip_file_head { + + static struct wd_ctx_config g_ctx_cfg; + static struct wd_sched *g_sched; ++static struct sched_params param; + static unsigned int g_thread_num; + static unsigned int g_ctxnum; + static unsigned int g_pktlen; +@@ -240,7 +242,7 @@ static int 
zip_uadk_param_parse(thread_data *tddata, struct acc_option *options) + u8 alg; + + if (optype >= WD_DIR_MAX << 1) { +- ZIP_TST_PRT("Fail to get zip optype!\n"); ++ ZIP_TST_PRT("failed to get zip optype!\n"); + return -EINVAL; + } else if (optype >= WD_DIR_MAX) { + mode = STREAM_MODE; +@@ -265,7 +267,7 @@ static int zip_uadk_param_parse(thread_data *tddata, struct acc_option *options) + optype = WD_DIR_COMPRESS; + break; + default: +- ZIP_TST_PRT("Fail to set zip alg\n"); ++ ZIP_TST_PRT("failed to set zip alg\n"); + return -EINVAL; + } + +@@ -298,21 +300,22 @@ static int init_ctx_config2(struct acc_option *options) + /* init */ + ret = wd_comp_init2(alg_name, SCHED_POLICY_RR, TASK_HW); + if (ret) { +- ZIP_TST_PRT("Fail to do comp init2!\n"); ++ ZIP_TST_PRT("failed to do comp init2!\n"); + return ret; + } + + return 0; + } + +-static struct sched_params param; + static int init_ctx_config(struct acc_option *options) + { +- struct uacce_dev_list *list; ++ struct uacce_dev_list *list, *tmp; + char *alg = options->algclass; + int optype = options->optype; + int mode = options->syncmode; +- int i, max_node; ++ struct uacce_dev *dev = NULL; ++ int max_node, i; ++ char *dev_name; + int ret = 0; + + optype = optype % WD_DIR_MAX; +@@ -322,61 +325,96 @@ static int init_ctx_config(struct acc_option *options) + + list = wd_get_accel_list(alg); + if (!list) { +- ZIP_TST_PRT("Fail to get %s device\n", alg); ++ ZIP_TST_PRT("failed to get %s device\n", alg); + return -ENODEV; + } +- memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); +- g_ctx_cfg.ctx_num = g_ctxnum; +- g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); +- if (!g_ctx_cfg.ctxs) +- return -ENOMEM; + +- g_sched = wd_sched_rr_alloc(SCHED_POLICY_RR, 2, max_node, wd_comp_poll_ctx); +- if (!g_sched) { +- ZIP_TST_PRT("Fail to alloc sched!\n"); +- goto out; ++ if (strlen(options->device) == 0) { ++ dev = list->dev; ++ } else { ++ for (tmp = list; tmp; tmp = tmp->next) { ++ dev_name = strrchr(tmp->dev->dev_root, '/') 
+ 1; ++ if (!strcmp(dev_name, options->device)) { ++ dev = tmp->dev; ++ break; ++ } ++ } ++ } ++ ++ if (dev == NULL) { ++ ZIP_TST_PRT("failed to find device %s\n", options->device); ++ ret = -ENODEV; ++ goto free_list; + } + + /* If there is no numa, we defualt config to zero */ +- if (list->dev->numa_id < 0) +- list->dev->numa_id = 0; ++ if (dev->numa_id < 0) ++ dev->numa_id = 0; ++ ++ memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); ++ g_ctx_cfg.ctx_num = g_ctxnum; ++ g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); ++ if (!g_ctx_cfg.ctxs) { ++ ret = -ENOMEM; ++ goto free_list; ++ } + + for (i = 0; i < g_ctxnum; i++) { +- g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(list->dev); +- g_ctx_cfg.ctxs[i].op_type = optype; // default op_type ++ g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); ++ if (!g_ctx_cfg.ctxs[i].ctx) { ++ ZIP_TST_PRT("failed to alloc %dth ctx\n", i); ++ goto free_ctx; ++ } ++ ++ g_ctx_cfg.ctxs[i].op_type = optype; + g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; + } ++ ++ g_sched = wd_sched_rr_alloc(SCHED_POLICY_RR, 2, max_node, wd_comp_poll_ctx); ++ if (!g_sched) { ++ ZIP_TST_PRT("failed to alloc sched!\n"); ++ ret = -ENOMEM; ++ goto free_ctx; ++ } ++ + g_sched->name = SCHED_SINGLE; + + /* + * All contexts for 2 modes & 2 types. + * The test only uses one kind of contexts at the same time. 
+ */ +- param.numa_id = list->dev->numa_id; ++ param.numa_id = dev->numa_id; + param.type = optype; + param.mode = mode; + param.begin = 0; + param.end = g_ctxnum - 1; + ret = wd_sched_rr_instance(g_sched, ¶m); + if (ret) { +- ZIP_TST_PRT("Fail to fill sched data!\n"); +- goto out; ++ ZIP_TST_PRT("failed to fill sched data!\n"); ++ goto free_sched; + } + +- /* init */ + ret = wd_comp_init(&g_ctx_cfg, g_sched); + if (ret) { +- ZIP_TST_PRT("Fail to cipher ctx!\n"); +- goto out; ++ ZIP_TST_PRT("failed to cipher ctx!\n"); ++ goto free_sched; + } + + wd_free_list_accels(list); + + return 0; +-out: +- free(g_ctx_cfg.ctxs); ++ ++free_sched: + wd_sched_rr_release(g_sched); + ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++ free(g_ctx_cfg.ctxs); ++ ++free_list: ++ wd_free_list_accels(list); ++ + return ret; + } + +-- +2.25.1 + diff --git a/0037-uadk_tool-support-sm3-ce-benchmark-and-function-test.patch b/0037-uadk_tool-support-sm3-ce-benchmark-and-function-test.patch new file mode 100644 index 0000000..58c0144 --- /dev/null +++ b/0037-uadk_tool-support-sm3-ce-benchmark-and-function-test.patch @@ -0,0 +1,308 @@ +From abb578a52d3bd6a34ac852afb56f7da06e8de266 Mon Sep 17 00:00:00 2001 +From: Zhiqi Song +Date: Mon, 11 Mar 2024 16:29:30 +0800 +Subject: [PATCH 37/44] uadk_tool: support sm3 ce benchmark and function test + +Support performance and function test for sm3-ce. 
+ +Signed-off-by: Zhiqi Song +--- + uadk_tool/Makefile.am | 1 + + uadk_tool/benchmark/sec_uadk_benchmark.c | 22 +++++++++++++-- + uadk_tool/benchmark/uadk_benchmark.c | 29 ++++++++++++-------- + uadk_tool/benchmark/uadk_benchmark.h | 1 + + uadk_tool/test/test_sec.c | 35 +++++++++++++++++++++--- + 5 files changed, 69 insertions(+), 19 deletions(-) + +diff --git a/uadk_tool/Makefile.am b/uadk_tool/Makefile.am +index 7f00087..6fa0d9d 100644 +--- a/uadk_tool/Makefile.am ++++ b/uadk_tool/Makefile.am +@@ -29,6 +29,7 @@ uadk_tool_LDADD=$(libwd_la_OBJECTS) \ + ../.libs/libhisi_sec.a \ + ../.libs/libhisi_hpre.a \ + ../.libs/libhisi_zip.a \ ++ ../.libs/libisa_ce.a \ + -ldl -lnuma + else + uadk_tool_LDADD=-L../.libs -l:libwd.so.2 -l:libwd_crypto.so.2 \ +diff --git a/uadk_tool/benchmark/sec_uadk_benchmark.c b/uadk_tool/benchmark/sec_uadk_benchmark.c +index 105fb1a..c3da616 100644 +--- a/uadk_tool/benchmark/sec_uadk_benchmark.c ++++ b/uadk_tool/benchmark/sec_uadk_benchmark.c +@@ -679,6 +679,7 @@ static void uninit_ctx_config2(int subtype) + wd_aead_uninit2(); + break; + case DIGEST_TYPE: ++ case DIGEST_INSTR_TYPE: + wd_digest_uninit2(); + break; + default: +@@ -703,12 +704,23 @@ static int init_ctx_config2(struct acc_option *options) + switch(subtype) { + case CIPHER_TYPE: + ret = wd_cipher_init2(alg_name, SCHED_POLICY_RR, TASK_HW); ++ if (ret) ++ SEC_TST_PRT("failed to do cipher init2!\n"); + break; + case AEAD_TYPE: + ret = wd_aead_init2(alg_name, SCHED_POLICY_RR, TASK_HW); ++ if (ret) ++ SEC_TST_PRT("failed to do aead init2!\n"); + break; + case DIGEST_TYPE: + ret = wd_digest_init2(alg_name, SCHED_POLICY_RR, TASK_HW); ++ if (ret) ++ SEC_TST_PRT("failed to do digest init2!\n"); ++ break; ++ case DIGEST_INSTR_TYPE: ++ ret = wd_digest_init2(alg_name, SCHED_POLICY_NONE, TASK_INSTR); ++ if (ret) ++ SEC_TST_PRT("failed to do digest intruction init2!\n"); + break; + } + if (ret) { +@@ -716,7 +728,7 @@ static int init_ctx_config2(struct acc_option *options) + return ret; + } + +- 
return 0; ++ return ret; + } + + static void get_aead_data(u8 *addr, u32 size) +@@ -1489,8 +1501,8 @@ static void *sec_uadk_digest_sync(void *arg) + } + } + dreq.in_bytes = g_pktlen; +- dreq.out_bytes = 16; +- dreq.out_buf_bytes = 16; ++ dreq.out_bytes = 32; ++ dreq.out_buf_bytes = 32; + dreq.data_fmt = 0; + dreq.state = 0; + dreq.has_next = 0; +@@ -1536,8 +1548,12 @@ int sec_uadk_sync_threads(struct acc_option *options) + uadk_sec_sync_run = sec_uadk_aead_sync; + break; + case DIGEST_TYPE: ++ case DIGEST_INSTR_TYPE: + uadk_sec_sync_run = sec_uadk_digest_sync; + break; ++ default: ++ SEC_TST_PRT("Invalid subtype!\n"); ++ return -EINVAL; + } + + for (i = 0; i < g_thread_num; i++) { +diff --git a/uadk_tool/benchmark/uadk_benchmark.c b/uadk_tool/benchmark/uadk_benchmark.c +index 5dbe26a..9c025cf 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.c ++++ b/uadk_tool/benchmark/uadk_benchmark.c +@@ -37,6 +37,7 @@ enum test_type { + SOFT_MODE = 0x4, + SVA_SOFT = 0x5, + NOSVA_SOFT = 0x6, ++ INSTR_MODE = 0x7, + INVALID_MODE = 0x8, + }; + +@@ -51,6 +52,7 @@ static struct acc_sva_item sys_name_item[] = { + {"soft", SOFT_MODE}, + {"sva-soft", SVA_SOFT}, + {"nosva-soft", NOSVA_SOFT}, ++ {"instr", INSTR_MODE}, + }; + + struct acc_alg_item { +@@ -286,7 +288,7 @@ static int get_alg_type(const char *alg_name) + + for (i = 0; i < ALG_MAX; i++) { + if (strcmp(alg_name, alg_options[i].name) == 0) { +- alg = alg_options[i].alg; ++ alg = alg_options[i].alg; + break; + } + } +@@ -482,8 +484,11 @@ static void parse_alg_param(struct acc_option *option) + option->subtype = AEAD_TYPE; + } else if (option->algtype <= SHA512_256) { + snprintf(option->algclass, MAX_ALG_NAME, "%s", "digest"); ++ if (option->modetype == INSTR_MODE) ++ option->subtype = DIGEST_INSTR_TYPE; ++ else ++ option->subtype = DIGEST_TYPE; + option->acctype = SEC_TYPE; +- option->subtype = DIGEST_TYPE; + } + } + } +@@ -545,35 +550,35 @@ static int benchmark_run(struct acc_option *option) + + switch(option->acctype) { + case 
SEC_TYPE: +- if (option->modetype & SVA_MODE) { ++ if ((option->modetype == SVA_MODE) || (option->modetype == INSTR_MODE)) { + ret = sec_uadk_benchmark(option); +- } else if (option->modetype & NOSVA_MODE) { ++ } else if (option->modetype == NOSVA_MODE) { + ret = sec_wd_benchmark(option); + } + usleep(20000); + #ifdef HAVE_CRYPTO +- if (option->modetype & SOFT_MODE) { ++ if (option->modetype == SOFT_MODE) { + ret = sec_soft_benchmark(option); + } + #endif + break; + case HPRE_TYPE: +- if (option->modetype & SVA_MODE) { ++ if (option->modetype == SVA_MODE) { + ret = hpre_uadk_benchmark(option); +- } else if (option->modetype & NOSVA_MODE) { ++ } else if (option->modetype == NOSVA_MODE) { + ret = hpre_wd_benchmark(option); + } + break; + case ZIP_TYPE: +- if (option->modetype & SVA_MODE) { ++ if (option->modetype == SVA_MODE) { + ret = zip_uadk_benchmark(option); +- } else if (option->modetype & NOSVA_MODE) { ++ } else if (option->modetype == NOSVA_MODE) { + ret = zip_wd_benchmark(option); + } + case TRNG_TYPE: +- if (option->modetype & SVA_MODE) ++ if (option->modetype == SVA_MODE) + ACC_TST_PRT("TRNG not support sva mode..\n"); +- else if (option->modetype & NOSVA_MODE) ++ else if (option->modetype == NOSVA_MODE) + ret = trng_wd_benchmark(option); + + break; +@@ -698,7 +703,7 @@ static void print_help(void) + ACC_TST_PRT("DESCRIPTION\n"); + ACC_TST_PRT(" [--alg aes-128-cbc ]:\n"); + ACC_TST_PRT(" The name of the algorithm for benchmarking\n"); +- ACC_TST_PRT(" [--mode sva/nosva/soft/sva-soft/nosva-soft]: start UADK or Warpdrive or Openssl mode test\n"); ++ ACC_TST_PRT(" [--mode sva/nosva/soft/sva-soft/nosva-soft/instr]: start UADK or Warpdrive or Openssl or Instruction mode test\n"); + ACC_TST_PRT(" [--sync/--async]: start asynchronous/synchronous mode test\n"); + ACC_TST_PRT(" [--opt 0,1,2,3,4,5]:\n"); + ACC_TST_PRT(" SEC/ZIP: 0/1:encryption/decryption or compression/decompression\n"); +diff --git a/uadk_tool/benchmark/uadk_benchmark.h 
b/uadk_tool/benchmark/uadk_benchmark.h +index fd3ebe5..e370d3e 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.h ++++ b/uadk_tool/benchmark/uadk_benchmark.h +@@ -104,6 +104,7 @@ enum alg_type { + SM2_TYPE, + X25519_TYPE, + X448_TYPE, ++ DIGEST_INSTR_TYPE, + }; + + enum sync_type { +diff --git a/uadk_tool/test/test_sec.c b/uadk_tool/test/test_sec.c +index 16feaf0..87fc718 100644 +--- a/uadk_tool/test/test_sec.c ++++ b/uadk_tool/test/test_sec.c +@@ -32,7 +32,7 @@ + #define SCHED_SINGLE "sched_single" + #define SCHED_NULL_CTX_SIZE 4 + #define TEST_WORD_LEN 4096 +-#define MAX_ALGO_PER_TYPE 16 ++#define MAX_ALGO_PER_TYPE 17 + #define MIN_SVA_BD_NUM 1 + #define AES_KEYSIZE_128 16 + #define AES_KEYSIZE_192 24 +@@ -83,6 +83,8 @@ enum digest_type { + LOCAL_AES_GMAC_192, + LOCAL_AES_GMAC_256, + LOCAL_AES_XCBC_MAC_96, ++ LOCAL_AES_XCBC_PRF_128, ++ LOCAL_AES_CCM, + }; + + char *digest_names[MAX_ALGO_PER_TYPE] = { +@@ -102,6 +104,7 @@ char *digest_names[MAX_ALGO_PER_TYPE] = { + "xcbc-mac-96(aes)", + "xcbc-prf-128(aes)", + "ccm(aes)", /* --digest 15: for error alg test */ ++ "sm3-ce", + }; + + char *aead_names[MAX_ALGO_PER_TYPE] = { +@@ -1464,11 +1467,14 @@ static int digest_init2(int type, int mode) + { + struct wd_ctx_params cparams = {0}; + struct wd_ctx_nums *ctx_set_num; ++ char *alg_name; + int ret; + + if (g_testalg >= MAX_ALGO_PER_TYPE) + return -WD_EINVAL; + ++ alg_name = digest_names[g_testalg]; ++ + ctx_set_num = calloc(1, sizeof(*ctx_set_num)); + if (!ctx_set_num) { + WD_ERR("failed to alloc ctx_set_size!\n"); +@@ -1492,7 +1498,10 @@ static int digest_init2(int type, int mode) + if (mode == CTX_MODE_ASYNC) + ctx_set_num->async_ctx_num = g_ctxnum; + +- ret = wd_digest_init2_(digest_names[g_testalg], 0, 0, &cparams); ++ if (!strcmp(alg_name, "sm3-ce")) ++ ret = wd_digest_init2("sm3", SCHED_POLICY_NONE, TASK_INSTR); ++ else ++ ret = wd_digest_init2_(digest_names[g_testalg], 0, 0, &cparams); + if (ret) + goto out_freebmp; + +@@ -1780,7 +1789,22 @@ int 
get_digest_resource(struct hash_testvec **alg_tv, int* alg, int* mode) + tv->dsize = 16; + alg_type = WD_DIGEST_AES_XCBC_PRF_128; + break; +- ++ case 16: /* SM3-CE */ ++ switch (g_alg_op_type) { ++ case 0: ++ mode_type = WD_DIGEST_NORMAL; ++ SEC_TST_PRT("test alg: %s\n", "normal(sm3-ce)"); ++ tv = &sm3_tv_template[0]; ++ break; ++ case 1: ++ mode_type = WD_DIGEST_HMAC; ++ SEC_TST_PRT("test alg: %s\n", "hmac(sm3-ce)"); ++ tv = &hmac_sm3_tv_template[0]; ++ break; ++ } ++ tv->dsize = 32; ++ alg_type = WD_DIGEST_SM3; ++ break; + default: + SEC_TST_PRT("keylenth error, default test alg: %s\n", "normal(sm3)"); + return -EINVAL; +@@ -4229,7 +4253,7 @@ static void print_help(void) + SEC_TST_PRT(" 4 : SHA224; 5 : SHA384; 6 : SHA512; 7 : SHA512_224\n"); + SEC_TST_PRT(" 8 : SHA512_256; 9 : AES_CMAC; 10 : AES_GMAC_128\n"); + SEC_TST_PRT(" 11 : AES_GMAC_192; 12 : AES_GMAC_256; 13 : AES_XCBC_MAC_96\n"); +- SEC_TST_PRT(" 14 : AES_XCBC_PRF_128\n"); ++ SEC_TST_PRT(" 14 : AES_XCBC_PRF_128; 15 : SM3-CE\n"); + SEC_TST_PRT(" [--aead ]:\n"); + SEC_TST_PRT(" specify symmetric aead algorithm\n"); + SEC_TST_PRT(" 0 : AES-CCM; 1 : AES-GCM; 2 : Hmac(sha256),cbc(aes)\n"); +@@ -4257,6 +4281,9 @@ static void print_help(void) + SEC_TST_PRT(" set the steam mode for digest\n"); + SEC_TST_PRT(" [--sglnum]:\n"); + SEC_TST_PRT(" the number of scatterlist number used by the entire test task\n"); ++ SEC_TST_PRT(" [--init]:\n"); ++ SEC_TST_PRT(" 1: use init API of uadk\n"); ++ SEC_TST_PRT(" 2: use init2 API of uadk\n"); + SEC_TST_PRT(" [--help] = usage\n"); + SEC_TST_PRT("Example\n"); + SEC_TST_PRT(" ./uadk_tool test --m sec --cipher 0 --sync --optype 0\n"); +-- +2.25.1 + diff --git a/0038-uadk_tool-support-sm4-ce-benchmark-test.patch b/0038-uadk_tool-support-sm4-ce-benchmark-test.patch new file mode 100644 index 0000000..a4523c9 --- /dev/null +++ b/0038-uadk_tool-support-sm4-ce-benchmark-test.patch @@ -0,0 +1,167 @@ +From 3ca60986407cfe3b1ddd264b0bfbe24d88856d71 Mon Sep 17 00:00:00 2001 +From: Qi Tao 
+Date: Mon, 11 Mar 2024 16:31:35 +0800 +Subject: [PATCH 38/44] uadk_tool: support sm4 ce benchmark test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Support performance test for sm4-ce. Add CBC-CS1、CBC-CS2 +and CBC-CS3 mode for SM4 algorithm in benchmark tool. + +Signed-off-by: Qi Tao +--- + uadk_tool/benchmark/sec_uadk_benchmark.c | 25 ++++++++++++++++++++++++ + uadk_tool/benchmark/sec_wd_benchmark.c | 18 +++++++++++++++++ + uadk_tool/benchmark/uadk_benchmark.c | 11 ++++++++++- + uadk_tool/benchmark/uadk_benchmark.h | 4 ++++ + 4 files changed, 57 insertions(+), 1 deletion(-) + +diff --git a/uadk_tool/benchmark/sec_uadk_benchmark.c b/uadk_tool/benchmark/sec_uadk_benchmark.c +index c3da616..f8b19ba 100644 +--- a/uadk_tool/benchmark/sec_uadk_benchmark.c ++++ b/uadk_tool/benchmark/sec_uadk_benchmark.c +@@ -346,6 +346,24 @@ static int sec_uadk_param_parse(thread_data *tddata, struct acc_option *options) + mode = WD_CIPHER_CBC; + alg = WD_CIPHER_SM4; + break; ++ case SM4_128_CBC_CS1: ++ keysize = 16; ++ ivsize = 16; ++ mode = WD_CIPHER_CBC_CS1; ++ alg = WD_CIPHER_SM4; ++ break; ++ case SM4_128_CBC_CS2: ++ keysize = 16; ++ ivsize = 16; ++ mode = WD_CIPHER_CBC_CS2; ++ alg = WD_CIPHER_SM4; ++ break; ++ case SM4_128_CBC_CS3: ++ keysize = 16; ++ ivsize = 16; ++ mode = WD_CIPHER_CBC_CS3; ++ alg = WD_CIPHER_SM4; ++ break; + case SM4_128_CTR: + keysize = 16; + ivsize = 16; +@@ -673,6 +691,7 @@ static void uninit_ctx_config2(int subtype) + /* uninit2 */ + switch(subtype) { + case CIPHER_TYPE: ++ case CIPHER_INSTR_TYPE: + wd_cipher_uninit2(); + break; + case AEAD_TYPE: +@@ -707,6 +726,11 @@ static int init_ctx_config2(struct acc_option *options) + if (ret) + SEC_TST_PRT("failed to do cipher init2!\n"); + break; ++ case CIPHER_INSTR_TYPE: ++ ret = wd_cipher_init2(alg_name, SCHED_POLICY_NONE, TASK_INSTR); ++ if (ret) ++ SEC_TST_PRT("failed to do cipher intruction init2!\n"); ++ break; + case AEAD_TYPE: + ret = 
wd_aead_init2(alg_name, SCHED_POLICY_RR, TASK_HW); + if (ret) +@@ -1542,6 +1566,7 @@ int sec_uadk_sync_threads(struct acc_option *options) + + switch (options->subtype) { + case CIPHER_TYPE: ++ case CIPHER_INSTR_TYPE: + uadk_sec_sync_run = sec_uadk_cipher_sync; + break; + case AEAD_TYPE: +diff --git a/uadk_tool/benchmark/sec_wd_benchmark.c b/uadk_tool/benchmark/sec_wd_benchmark.c +index 2ed8493..bb47d61 100644 +--- a/uadk_tool/benchmark/sec_wd_benchmark.c ++++ b/uadk_tool/benchmark/sec_wd_benchmark.c +@@ -412,6 +412,24 @@ static int sec_wd_param_parse(thread_data *tddata, struct acc_option *options) + mode = WCRYPTO_CIPHER_CBC; + alg = WCRYPTO_CIPHER_SM4; + break; ++ case SM4_128_CBC_CS1: ++ keysize = 16; ++ ivsize = 16; ++ mode = WCRYPTO_CIPHER_CBC_CS1; ++ alg = WCRYPTO_CIPHER_SM4; ++ break; ++ case SM4_128_CBC_CS2: ++ keysize = 16; ++ ivsize = 16; ++ mode = WCRYPTO_CIPHER_CBC_CS2; ++ alg = WCRYPTO_CIPHER_SM4; ++ break; ++ case SM4_128_CBC_CS3: ++ keysize = 16; ++ ivsize = 16; ++ mode = WCRYPTO_CIPHER_CBC_CS3; ++ alg = WCRYPTO_CIPHER_SM4; ++ break; + case SM4_128_CTR: + keysize = 16; + ivsize = 16; +diff --git a/uadk_tool/benchmark/uadk_benchmark.c b/uadk_tool/benchmark/uadk_benchmark.c +index 9c025cf..f9bb69c 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.c ++++ b/uadk_tool/benchmark/uadk_benchmark.c +@@ -120,6 +120,9 @@ static struct acc_alg_item alg_options[] = { + {"3des-192-cbc", DES3_192_CBC}, + {"sm4-128-ecb", SM4_128_ECB}, + {"sm4-128-cbc", SM4_128_CBC}, ++ {"sm4-128-cbc-cs1", SM4_128_CBC_CS1}, ++ {"sm4-128-cbc-cs2", SM4_128_CBC_CS2}, ++ {"sm4-128-cbc-cs3", SM4_128_CBC_CS3}, + {"sm4-128-ctr", SM4_128_CTR}, + {"sm4-128-ofb", SM4_128_OFB}, + {"sm4-128-cfb", SM4_128_CFB}, +@@ -209,6 +212,9 @@ static struct acc_alg_item alg_name_options[] = { + {"cbc(des3_ede)", DES3_192_CBC}, + {"ecb(sm4)", SM4_128_ECB}, + {"cbc(sm4)", SM4_128_CBC}, ++ {"cbc-cs1(sm4)", SM4_128_CBC_CS1}, ++ {"cbc-cs2(sm4)", SM4_128_CBC_CS2}, ++ {"cbc-cs3(sm4)", SM4_128_CBC_CS3}, + 
{"ctr(sm4)", SM4_128_CTR}, + {"ofb(sm4)", SM4_128_OFB}, + {"cfb(sm4)", SM4_128_CFB}, +@@ -476,8 +482,11 @@ static void parse_alg_param(struct acc_option *option) + option->subtype = ECDSA_TYPE; + } else if (option->algtype <= SM4_128_XTS_GB) { + snprintf(option->algclass, MAX_ALG_NAME, "%s", "cipher"); ++ if (option->modetype == INSTR_MODE) ++ option->subtype = CIPHER_INSTR_TYPE; ++ else ++ option->subtype = CIPHER_TYPE; + option->acctype = SEC_TYPE; +- option->subtype = CIPHER_TYPE; + } else if (option->algtype <= SM4_128_GCM) { + snprintf(option->algclass, MAX_ALG_NAME, "%s", "aead"); + option->acctype = SEC_TYPE; +diff --git a/uadk_tool/benchmark/uadk_benchmark.h b/uadk_tool/benchmark/uadk_benchmark.h +index e370d3e..ea8e437 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.h ++++ b/uadk_tool/benchmark/uadk_benchmark.h +@@ -105,6 +105,7 @@ enum alg_type { + X25519_TYPE, + X448_TYPE, + DIGEST_INSTR_TYPE, ++ CIPHER_INSTR_TYPE, + }; + + enum sync_type { +@@ -172,6 +173,9 @@ enum test_alg { + DES3_192_CBC, + SM4_128_ECB, + SM4_128_CBC, ++ SM4_128_CBC_CS1, ++ SM4_128_CBC_CS2, ++ SM4_128_CBC_CS3, + SM4_128_CTR, + SM4_128_OFB, + SM4_128_CFB, +-- +2.25.1 + diff --git a/0039-uadk_tool-support-sm3-md5-multibuff-benchmark-test.patch b/0039-uadk_tool-support-sm3-md5-multibuff-benchmark-test.patch new file mode 100644 index 0000000..f78d464 --- /dev/null +++ b/0039-uadk_tool-support-sm3-md5-multibuff-benchmark-test.patch @@ -0,0 +1,272 @@ +From 7b3f79fedc187ded4dc7d6bdc976d0e560cc746d Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Mon, 11 Mar 2024 16:33:58 +0800 +Subject: [PATCH 39/44] uadk_tool: support sm3/md5 multibuff benchmark test + +Support sm3/md5 multibuff benchmark test + +Signed-off-by: Weili Qian +--- + uadk_tool/benchmark/sec_uadk_benchmark.c | 31 +++++++++++++++--------- + uadk_tool/benchmark/uadk_benchmark.c | 26 ++++++++++++++------ + uadk_tool/benchmark/uadk_benchmark.h | 3 ++- + 3 files changed, 40 insertions(+), 20 deletions(-) + +diff --git 
a/uadk_tool/benchmark/sec_uadk_benchmark.c b/uadk_tool/benchmark/sec_uadk_benchmark.c +index f8b19ba..2c12c20 100644 +--- a/uadk_tool/benchmark/sec_uadk_benchmark.c ++++ b/uadk_tool/benchmark/sec_uadk_benchmark.c +@@ -53,6 +53,7 @@ typedef struct uadk_thread_res { + bool is_union; + u32 dalg; + u32 dmode; ++ u32 d_outbytes; + } thread_data; + + static struct wd_ctx_config g_ctx_cfg; +@@ -146,6 +147,7 @@ static int sec_uadk_param_parse(thread_data *tddata, struct acc_option *options) + u32 algtype = options->algtype; + u32 optype = options->optype; + bool is_union = false; ++ u32 out_bytes = 32; + u8 keysize = 0; + u8 ivsize = 0; + u8 dmode = 0; +@@ -472,45 +474,54 @@ static int sec_uadk_param_parse(thread_data *tddata, struct acc_option *options) + case SM3_ALG: // digest mode is optype + keysize = 4; + mode = optype; ++ out_bytes = 32; + alg = WD_DIGEST_SM3; + break; + case MD5_ALG: + keysize = 4; ++ out_bytes = 16; + mode = optype; + alg = WD_DIGEST_MD5; + break; + case SHA1_ALG: + keysize = 4; ++ out_bytes = 20; + mode = optype; + alg = WD_DIGEST_SHA1; + break; + case SHA256_ALG: + keysize = 4; ++ out_bytes = 32; + mode = optype; + alg = WD_DIGEST_SHA256; + break; + case SHA224_ALG: + keysize = 4; ++ out_bytes = 28; + mode = optype; + alg = WD_DIGEST_SHA224; + break; + case SHA384_ALG: + keysize = 4; ++ out_bytes = 48; + mode = optype; + alg = WD_DIGEST_SHA384; + break; + case SHA512_ALG: + keysize = 4; ++ out_bytes = 64; + mode = optype; + alg = WD_DIGEST_SHA512; + break; + case SHA512_224: + keysize = 4; ++ out_bytes = 28; + mode = optype; + alg = WD_DIGEST_SHA512_224; + break; + case SHA512_256: + keysize = 4; ++ out_bytes = 32; + mode = optype; + alg = WD_DIGEST_SHA512_256; + break; +@@ -528,6 +539,7 @@ static int sec_uadk_param_parse(thread_data *tddata, struct acc_option *options) + tddata->is_union = is_union; + tddata->optype = options->optype; + tddata->subtype = options->subtype; ++ tddata->d_outbytes = out_bytes; + + return 0; + } +@@ -698,7 +710,6 @@ 
static void uninit_ctx_config2(int subtype) + wd_aead_uninit2(); + break; + case DIGEST_TYPE: +- case DIGEST_INSTR_TYPE: + wd_digest_uninit2(); + break; + default: +@@ -737,15 +748,10 @@ static int init_ctx_config2(struct acc_option *options) + SEC_TST_PRT("failed to do aead init2!\n"); + break; + case DIGEST_TYPE: +- ret = wd_digest_init2(alg_name, SCHED_POLICY_RR, TASK_HW); ++ ret = wd_digest_init2(alg_name, options->sched_type, options->task_type); + if (ret) + SEC_TST_PRT("failed to do digest init2!\n"); + break; +- case DIGEST_INSTR_TYPE: +- ret = wd_digest_init2(alg_name, SCHED_POLICY_NONE, TASK_INSTR); +- if (ret) +- SEC_TST_PRT("failed to do digest intruction init2!\n"); +- break; + } + if (ret) { + SEC_TST_PRT("failed to do cipher init2!\n"); +@@ -1305,8 +1311,8 @@ static void *sec_uadk_digest_async(void *arg) + } + } + dreq.in_bytes = g_pktlen; +- dreq.out_bytes = 16; +- dreq.out_buf_bytes = 16; ++ dreq.out_bytes = pdata->d_outbytes; ++ dreq.out_buf_bytes = pdata->d_outbytes; + dreq.data_fmt = 0; + dreq.state = 0; + dreq.has_next = 0; +@@ -1525,8 +1531,8 @@ static void *sec_uadk_digest_sync(void *arg) + } + } + dreq.in_bytes = g_pktlen; +- dreq.out_bytes = 32; +- dreq.out_buf_bytes = 32; ++ dreq.out_bytes = pdata->d_outbytes; ++ dreq.out_buf_bytes = pdata->d_outbytes; + dreq.data_fmt = 0; + dreq.state = 0; + dreq.has_next = 0; +@@ -1573,7 +1579,6 @@ int sec_uadk_sync_threads(struct acc_option *options) + uadk_sec_sync_run = sec_uadk_aead_sync; + break; + case DIGEST_TYPE: +- case DIGEST_INSTR_TYPE: + uadk_sec_sync_run = sec_uadk_digest_sync; + break; + default: +@@ -1591,6 +1596,7 @@ int sec_uadk_sync_threads(struct acc_option *options) + threads_args[i].ivsize = threads_option.ivsize; + threads_args[i].optype = threads_option.optype; + threads_args[i].td_id = i; ++ threads_args[i].d_outbytes = threads_option.d_outbytes; + ret = pthread_create(&tdid[i], NULL, uadk_sec_sync_run, &threads_args[i]); + if (ret) { + SEC_TST_PRT("Create sync thread fail!\n"); 
+@@ -1662,6 +1668,7 @@ int sec_uadk_async_threads(struct acc_option *options) + threads_args[i].ivsize = threads_option.ivsize; + threads_args[i].optype = threads_option.optype; + threads_args[i].td_id = i; ++ threads_args[i].d_outbytes = threads_option.d_outbytes; + ret = pthread_create(&tdid[i], NULL, uadk_sec_async_run, &threads_args[i]); + if (ret) { + SEC_TST_PRT("Create async thread fail!\n"); +diff --git a/uadk_tool/benchmark/uadk_benchmark.c b/uadk_tool/benchmark/uadk_benchmark.c +index f9bb69c..c1accc6 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.c ++++ b/uadk_tool/benchmark/uadk_benchmark.c +@@ -2,6 +2,8 @@ + + #include + #include ++#include "include/wd_alg_common.h" ++#include "include/wd_sched.h" + + #include "uadk_benchmark.h" + #include "sec_uadk_benchmark.h" +@@ -38,7 +40,8 @@ enum test_type { + SVA_SOFT = 0x5, + NOSVA_SOFT = 0x6, + INSTR_MODE = 0x7, +- INVALID_MODE = 0x8, ++ MULTIBUF_MODE = 0x8, ++ INVALID_MODE = 0x9, + }; + + struct acc_sva_item { +@@ -53,6 +56,7 @@ static struct acc_sva_item sys_name_item[] = { + {"sva-soft", SVA_SOFT}, + {"nosva-soft", NOSVA_SOFT}, + {"instr", INSTR_MODE}, ++ {"multibuff", MULTIBUF_MODE}, + }; + + struct acc_alg_item { +@@ -493,11 +497,15 @@ static void parse_alg_param(struct acc_option *option) + option->subtype = AEAD_TYPE; + } else if (option->algtype <= SHA512_256) { + snprintf(option->algclass, MAX_ALG_NAME, "%s", "digest"); +- if (option->modetype == INSTR_MODE) +- option->subtype = DIGEST_INSTR_TYPE; +- else +- option->subtype = DIGEST_TYPE; ++ option->subtype = DIGEST_TYPE; + option->acctype = SEC_TYPE; ++ if (option->modetype == INSTR_MODE) { ++ option->sched_type = SCHED_POLICY_NONE; ++ option->task_type = TASK_INSTR; ++ } else if (option->modetype == MULTIBUF_MODE) { ++ option->sched_type = SCHED_POLICY_SINGLE; ++ option->task_type = TASK_INSTR; ++ } + } + } + } +@@ -559,7 +567,9 @@ static int benchmark_run(struct acc_option *option) + + switch(option->acctype) { + case SEC_TYPE: +- if 
((option->modetype == SVA_MODE) || (option->modetype == INSTR_MODE)) { ++ if ((option->modetype == SVA_MODE) || ++ (option->modetype == INSTR_MODE) || ++ (option->modetype == MULTIBUF_MODE)) { + ret = sec_uadk_benchmark(option); + } else if (option->modetype == NOSVA_MODE) { + ret = sec_wd_benchmark(option); +@@ -623,6 +633,8 @@ int acc_benchmark_run(struct acc_option *option) + int i, ret = 0; + int status; + ++ option->sched_type = SCHED_POLICY_RR; ++ option->task_type = TASK_HW; + parse_alg_param(option); + dump_param(option); + g_run_options = option; +@@ -712,7 +724,7 @@ static void print_help(void) + ACC_TST_PRT("DESCRIPTION\n"); + ACC_TST_PRT(" [--alg aes-128-cbc ]:\n"); + ACC_TST_PRT(" The name of the algorithm for benchmarking\n"); +- ACC_TST_PRT(" [--mode sva/nosva/soft/sva-soft/nosva-soft/instr]: start UADK or Warpdrive or Openssl or Instruction mode test\n"); ++ ACC_TST_PRT(" [--mode sva/nosva/soft/sva-soft/nosva-soft/instr/multibuff]: start UADK or Warpdrive or Openssl or Instruction mode test\n"); + ACC_TST_PRT(" [--sync/--async]: start asynchronous/synchronous mode test\n"); + ACC_TST_PRT(" [--opt 0,1,2,3,4,5]:\n"); + ACC_TST_PRT(" SEC/ZIP: 0/1:encryption/decryption or compression/decompression\n"); +diff --git a/uadk_tool/benchmark/uadk_benchmark.h b/uadk_tool/benchmark/uadk_benchmark.h +index ea8e437..c493ac3 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.h ++++ b/uadk_tool/benchmark/uadk_benchmark.h +@@ -77,6 +77,8 @@ struct acc_option { + u32 complevel; + u32 inittype; + bool latency; ++ u32 sched_type; ++ int task_type; + }; + + enum acc_type { +@@ -104,7 +106,6 @@ enum alg_type { + SM2_TYPE, + X25519_TYPE, + X448_TYPE, +- DIGEST_INSTR_TYPE, + CIPHER_INSTR_TYPE, + }; + +-- +2.25.1 + diff --git a/0040-uadk-tool-fix-the-msg-pool-release-bug-of-async-zip-.patch b/0040-uadk-tool-fix-the-msg-pool-release-bug-of-async-zip-.patch new file mode 100644 index 0000000..9eaee1f --- /dev/null +++ 
b/0040-uadk-tool-fix-the-msg-pool-release-bug-of-async-zip-.patch @@ -0,0 +1,467 @@ +From 34c49db7d9eba5255f678179da95a15976dbb305 Mon Sep 17 00:00:00 2001 +From: Chenghai Huang +Date: Mon, 11 Mar 2024 16:36:13 +0800 +Subject: [PATCH 40/44] uadk tool: fix the msg pool release bug of async zip + benchmark + +Ensure that all packets in the msg pool are removed before end. +In V2, resources such as tags are released in a centralized manner +to avoid errors caused by asynchronous resource release sequence. +In V1, before the packet sending thread releases the tag, ensure +that the poll thread has ended. + +Signed-off-by: Chenghai Huang +--- + uadk_tool/benchmark/uadk_benchmark.c | 4 +- + uadk_tool/benchmark/zip_uadk_benchmark.c | 151 ++++++++++++----------- + uadk_tool/benchmark/zip_wd_benchmark.c | 25 ++-- + 3 files changed, 99 insertions(+), 81 deletions(-) + +diff --git a/uadk_tool/benchmark/uadk_benchmark.c b/uadk_tool/benchmark/uadk_benchmark.c +index c1accc6..1262a2a 100644 +--- a/uadk_tool/benchmark/uadk_benchmark.c ++++ b/uadk_tool/benchmark/uadk_benchmark.c +@@ -594,6 +594,7 @@ static int benchmark_run(struct acc_option *option) + } else if (option->modetype == NOSVA_MODE) { + ret = zip_wd_benchmark(option); + } ++ break; + case TRNG_TYPE: + if (option->modetype == SVA_MODE) + ACC_TST_PRT("TRNG not support sva mode..\n"); +@@ -727,7 +728,8 @@ static void print_help(void) + ACC_TST_PRT(" [--mode sva/nosva/soft/sva-soft/nosva-soft/instr/multibuff]: start UADK or Warpdrive or Openssl or Instruction mode test\n"); + ACC_TST_PRT(" [--sync/--async]: start asynchronous/synchronous mode test\n"); + ACC_TST_PRT(" [--opt 0,1,2,3,4,5]:\n"); +- ACC_TST_PRT(" SEC/ZIP: 0/1:encryption/decryption or compression/decompression\n"); ++ ACC_TST_PRT(" SEC: cipher,aead: 0/1:encryption/decryption; digest: 0/1:normal/hmac\n"); ++ ACC_TST_PRT(" ZIP: 0~1:block compression, block decompression; 2~3:stream compression, stream decompression\n"); + ACC_TST_PRT(" HPRE: 0~5:keygen, key 
compute, Enc, Dec, Sign, Verify\n"); + ACC_TST_PRT(" [--pktlen]:\n"); + ACC_TST_PRT(" set the length of BD message in bytes\n"); +diff --git a/uadk_tool/benchmark/zip_uadk_benchmark.c b/uadk_tool/benchmark/zip_uadk_benchmark.c +index 63fbdab..1dd3990 100644 +--- a/uadk_tool/benchmark/zip_uadk_benchmark.c ++++ b/uadk_tool/benchmark/zip_uadk_benchmark.c +@@ -16,7 +16,7 @@ + #define MAX_POOL_LENTH_COMP 1 + #define COMPRESSION_RATIO_FACTOR 0.7 + #define CHUNK_SIZE (128 * 1024) +- ++#define MAX_UNRECV_PACKET_NUM 2 + struct uadk_bd { + u8 *src; + u8 *dst; +@@ -37,11 +37,17 @@ enum ZIP_OP_MODE { + STREAM_MODE + }; + ++enum ZIP_THREAD_STATE { ++ THREAD_PROCESSING, ++ THREAD_COMPLETED ++}; ++ + struct zip_async_tag { + handle_t sess; + u32 td_id; + u32 bd_idx; + u32 cm_len; ++ u32 recv_cnt; + ZSTD_CCtx *cctx; + }; + +@@ -52,6 +58,10 @@ typedef struct uadk_thread_res { + u32 td_id; + u32 win_sz; + u32 comp_lv; ++ u32 send_cnt; ++ struct zip_async_tag *tag; ++ COMP_TUPLE_TAG *ftuple; ++ char *hw_buff_out; + } thread_data; + + struct zip_file_head { +@@ -67,6 +77,7 @@ static unsigned int g_thread_num; + static unsigned int g_ctxnum; + static unsigned int g_pktlen; + static unsigned int g_prefetch; ++static unsigned int g_state; + + #ifndef ZLIB_FSE + static ZSTD_CCtx* zstd_soft_fse_init(unsigned int level) +@@ -541,6 +552,7 @@ static void *zip_lz77_async_cb(struct wd_comp_req *req, void *data) + zstd_output.dst = uadk_pool->bds[idx].dst; + zstd_output.size = tag->cm_len; + zstd_output.pos = 0; ++ __atomic_add_fetch(&tag->recv_cnt, 1, __ATOMIC_RELAXED); + fse_size = zstd_soft_fse(req->priv, &zstd_input, &zstd_output, cctx, ZSTD_e_end); + + uadk_pool->bds[idx].dst_len = fse_size; +@@ -554,6 +566,7 @@ static void *zip_async_cb(struct wd_comp_req *req, void *data) + struct bd_pool *uadk_pool; + int td_id = tag->td_id; + int idx = tag->bd_idx; ++ __atomic_add_fetch(&tag->recv_cnt, 1, __ATOMIC_RELAXED); + + uadk_pool = &g_zip_pool.pool[td_id]; + uadk_pool->bds[idx].dst_len = 
req->dst_len; +@@ -566,15 +579,14 @@ static void *zip_uadk_poll(void *data) + thread_data *pdata = (thread_data *)data; + u32 expt = ACC_QUEUE_SIZE * g_thread_num; + u32 id = pdata->td_id; +- u32 last_time = 2; // poll need one more recv time + u32 count = 0; + u32 recv = 0; +- int ret; ++ int ret; + + if (id > g_ctxnum) + return NULL; + +- while (last_time) { ++ while (g_state == THREAD_PROCESSING) { + ret = wd_comp_poll_ctx(id, expt, &recv); + count += recv; + recv = 0; +@@ -582,9 +594,6 @@ static void *zip_uadk_poll(void *data) + ZIP_TST_PRT("poll ret: %d!\n", ret); + goto recv_error; + } +- +- if (get_run_state() == 0) +- last_time--; + } + + recv_error: +@@ -596,12 +605,11 @@ recv_error: + static void *zip_uadk_poll2(void *data) + { + u32 expt = ACC_QUEUE_SIZE * g_thread_num; +- u32 last_time = 2; // poll need one more recv time + u32 count = 0; + u32 recv = 0; + int ret; + +- while (last_time) { ++ while (g_state == THREAD_PROCESSING) { + ret = wd_comp_poll(expt, &recv); + count += recv; + recv = 0; +@@ -609,9 +617,6 @@ static void *zip_uadk_poll2(void *data) + ZIP_TST_PRT("poll ret: %d!\n", ret); + goto recv_error; + } +- +- if (get_run_state() == 0) +- last_time--; + } + + recv_error: +@@ -803,11 +808,8 @@ static void *zip_uadk_blk_lz77_async_run(void *arg) + thread_data *pdata = (thread_data *)arg; + struct wd_comp_sess_setup comp_setup = {0}; + ZSTD_CCtx *cctx = zstd_soft_fse_init(15); +- COMP_TUPLE_TAG *ftuple = NULL; + struct bd_pool *uadk_pool; + struct wd_comp_req creq; +- struct zip_async_tag *tag; +- char *hw_buff_out = NULL; + handle_t h_sess; + u32 out_len = 0; + u32 count = 0; +@@ -838,37 +840,22 @@ static void *zip_uadk_blk_lz77_async_run(void *arg) + creq.data_fmt = 0; + creq.status = 0; + +- ftuple = malloc(sizeof(COMP_TUPLE_TAG) * MAX_POOL_LENTH_COMP); +- if (!ftuple) +- goto fse_err; +- +- hw_buff_out = malloc(out_len * MAX_POOL_LENTH_COMP); +- if (!hw_buff_out) +- goto hw_buff_err; +- memset(hw_buff_out, 0x0, out_len * MAX_POOL_LENTH_COMP); 
+- +- tag = malloc(sizeof(*tag) * MAX_POOL_LENTH_COMP); +- if (!tag) { +- ZIP_TST_PRT("failed to malloc zip tag!\n"); +- goto tag_err; +- } +- + while(1) { + if (get_run_state() == 0) + break; + + i = count % MAX_POOL_LENTH_COMP; + creq.src = uadk_pool->bds[i].src; +- creq.dst = &hw_buff_out[i]; //temp out ++ creq.dst = &pdata->hw_buff_out[i]; //temp out + creq.src_len = uadk_pool->bds[i].src_len; + creq.dst_len = out_len; +- creq.priv = &ftuple[i]; ++ creq.priv = &pdata->ftuple[i]; + +- tag[i].td_id = pdata->td_id; +- tag[i].bd_idx = i; +- tag[i].cm_len = out_len; +- tag[i].cctx = cctx; +- creq.cb_param = &tag[i]; ++ pdata->tag[i].td_id = pdata->td_id; ++ pdata->tag[i].bd_idx = i; ++ pdata->tag[i].cm_len = out_len; ++ pdata->tag[i].cctx = cctx; ++ creq.cb_param = &pdata->tag[i]; + + ret = wd_do_comp_async(h_sess, &creq); + if (ret == -WD_EBUSY) { +@@ -884,20 +871,8 @@ static void *zip_uadk_blk_lz77_async_run(void *arg) + } + try_cnt = 0; + count++; ++ __atomic_add_fetch(&pdata->send_cnt, 1, __ATOMIC_RELAXED); + } +- +- while (1) { +- if (get_recv_time() > 0) // wait Async mode finish recv +- break; +- usleep(SEND_USLEEP); +- } +- +-tag_err: +- free(tag); +-hw_buff_err: +- free(hw_buff_out); +-fse_err: +- free(ftuple); + wd_comp_free_sess(h_sess); + add_send_complete(); + +@@ -1033,7 +1008,6 @@ static void *zip_uadk_blk_async_run(void *arg) + thread_data *pdata = (thread_data *)arg; + struct wd_comp_sess_setup comp_setup = {0}; + struct bd_pool *uadk_pool; +- struct zip_async_tag *tag; + struct wd_comp_req creq; + handle_t h_sess; + int try_cnt = 0; +@@ -1066,13 +1040,6 @@ static void *zip_uadk_blk_async_run(void *arg) + creq.priv = 0; + creq.status = 0; + +- tag = malloc(sizeof(*tag) * MAX_POOL_LENTH_COMP); +- if (!tag) { +- ZIP_TST_PRT("failed to malloc zip tag!\n"); +- wd_comp_free_sess(h_sess); +- return NULL; +- } +- + while(1) { + if (get_run_state() == 0) + break; +@@ -1083,9 +1050,9 @@ static void *zip_uadk_blk_async_run(void *arg) + creq.src_len = 
uadk_pool->bds[i].src_len; + creq.dst_len = out_len; + +- tag[i].td_id = pdata->td_id; +- tag[i].bd_idx = i; +- creq.cb_param = &tag[i]; ++ pdata->tag[i].td_id = pdata->td_id; ++ pdata->tag[i].bd_idx = i; ++ creq.cb_param = &pdata->tag[i]; + + ret = wd_do_comp_async(h_sess, &creq); + if (ret == -WD_EBUSY) { +@@ -1101,15 +1068,9 @@ static void *zip_uadk_blk_async_run(void *arg) + } + try_cnt = 0; + count++; ++ __atomic_add_fetch(&pdata->send_cnt, 1, __ATOMIC_RELAXED); + } + +- while (1) { +- if (get_recv_time() > 0) // wait Async mode finish recv +- break; +- usleep(SEND_USLEEP); +- } +- +- free(tag); + wd_comp_free_sess(h_sess); + + add_send_complete(); +@@ -1215,10 +1176,35 @@ static int zip_uadk_async_threads(struct acc_option *options) + threads_args[i].win_sz = threads_option.win_sz; + threads_args[i].comp_lv = threads_option.comp_lv; + threads_args[i].td_id = i; ++ if (threads_option.alg == LZ77_ZSTD) { ++ struct bd_pool *uadk_pool = &g_zip_pool.pool[i]; ++ u32 out_len = uadk_pool->bds[0].dst_len; ++ ++ threads_args[i].ftuple = malloc(sizeof(COMP_TUPLE_TAG) * ++ MAX_POOL_LENTH_COMP); ++ if (!threads_args[i].ftuple) { ++ ZIP_TST_PRT("failed to malloc lz77 ftuple!\n"); ++ goto lz77_free; ++ } ++ ++ threads_args[i].hw_buff_out = malloc(out_len * MAX_POOL_LENTH_COMP); ++ if (!threads_args[i].hw_buff_out) { ++ ZIP_TST_PRT("failed to malloc lz77 hw_buff_out!\n"); ++ goto lz77_free; ++ } ++ memset(threads_args[i].hw_buff_out, 0x0, out_len * MAX_POOL_LENTH_COMP); ++ } ++ threads_args[i].tag = malloc(sizeof(struct zip_async_tag) * MAX_POOL_LENTH_COMP); ++ if (!threads_args[i].tag) { ++ ZIP_TST_PRT("failed to malloc zip tag!\n"); ++ goto tag_free; ++ } ++ threads_args[i].tag->recv_cnt = 0; ++ threads_args[i].send_cnt = 0; + ret = pthread_create(&tdid[i], NULL, uadk_zip_async_run, &threads_args[i]); + if (ret) { + ZIP_TST_PRT("Create async thread fail!\n"); +- goto async_error; ++ goto tag_free; + } + } + +@@ -1227,18 +1213,41 @@ static int zip_uadk_async_threads(struct 
acc_option *options) + ret = pthread_join(tdid[i], NULL); + if (ret) { + ZIP_TST_PRT("Join async thread fail!\n"); +- goto async_error; ++ goto tag_free; + } + } + ++ /* wait for the poll to clear packets */ ++ g_state = THREAD_PROCESSING; ++ for (i = 0; i < g_thread_num;) { ++ if (threads_args[i].send_cnt <= threads_args[i].tag->recv_cnt + MAX_UNRECV_PACKET_NUM) ++ i++; ++ } ++ g_state = THREAD_COMPLETED; // finish poll ++ + for (i = 0; i < g_ctxnum; i++) { + ret = pthread_join(pollid[i], NULL); + if (ret) { + ZIP_TST_PRT("Join poll thread fail!\n"); +- goto async_error; ++ goto tag_free; + } + } + ++tag_free: ++ for (i = 0; i < g_thread_num; i++) { ++ if (threads_args[i].tag) ++ free(threads_args[i].tag); ++ } ++lz77_free: ++ if (threads_option.alg == LZ77_ZSTD) { ++ for (i = 0; i < g_thread_num; i++) { ++ if (threads_args[i].ftuple) ++ free(threads_args[i].ftuple); ++ ++ if (threads_args[i].hw_buff_out) ++ free(threads_args[i].hw_buff_out); ++ } ++ } + async_error: + return ret; + } +diff --git a/uadk_tool/benchmark/zip_wd_benchmark.c b/uadk_tool/benchmark/zip_wd_benchmark.c +index 4424e08..cbe07fc 100644 +--- a/uadk_tool/benchmark/zip_wd_benchmark.c ++++ b/uadk_tool/benchmark/zip_wd_benchmark.c +@@ -21,6 +21,7 @@ + #define COMPRESSION_RATIO_FACTOR 0.7 + #define MAX_POOL_LENTH_COMP 512 + #define CHUNK_SIZE (128 * 1024) ++#define MAX_UNRECV_PACKET_NUM 2 + + #define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) + #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) +@@ -49,6 +50,11 @@ enum ZIP_OP_MODE { + STREAM_MODE + }; + ++enum ZIP_THREAD_STATE { ++ THREAD_PROCESSING, ++ THREAD_COMPLETED ++}; ++ + struct zip_async_tag { + void *ctx; + u32 td_id; +@@ -75,6 +81,8 @@ struct zip_file_head { + + static unsigned int g_thread_num; + static unsigned int g_pktlen; ++static unsigned int g_send_cnt[THREADS_NUM]; ++static unsigned int g_recv_state[THREADS_NUM]; + + static int save_file_data(const char *alg, u32 pkg_len, u32 optype) + { +@@ -470,9 +478,10 @@ static void 
*zip_wd_poll(void *data) + count += recv; + recv = 0; + +- if (get_run_state() == 0) ++ if (get_run_state() == 0 && g_send_cnt[id] <= count + MAX_UNRECV_PACKET_NUM) + last_time--; + } ++ g_recv_state[id] = THREAD_COMPLETED; + + recv_error: + add_recv_data(count, g_pktlen); +@@ -746,13 +755,11 @@ static void *zip_wd_blk_lz77_async_run(void *arg) + } + try_cnt = 0; + count++; ++ __atomic_add_fetch(&g_send_cnt[pdata->td_id], 1, __ATOMIC_RELAXED); + } + +- while (1) { +- if (get_recv_time() > 0) // wait Async mode finish recv +- break; ++ while (g_recv_state[pdata->td_id] == THREAD_PROCESSING) + usleep(SEND_USLEEP); +- } + + free(tag); + tag_err: +@@ -1011,13 +1018,11 @@ static void *zip_wd_blk_async_run(void *arg) + } + try_cnt = 0; + count++; ++ __atomic_add_fetch(&g_send_cnt[pdata->td_id], 1, __ATOMIC_RELAXED); + } + +- while (1) { +- if (get_recv_time() > 0) // wait Async mode finish recv +- break; ++ while (g_recv_state[pdata->td_id] == THREAD_PROCESSING) + usleep(SEND_USLEEP); +- } + + tag_release: + free(tag); +@@ -1107,6 +1112,7 @@ static int zip_wd_async_threads(struct acc_option *options) + + for (i = 0; i < g_thread_num; i++) { + threads_args[i].td_id = i; ++ g_recv_state[i] = THREAD_PROCESSING; + /* poll thread */ + ret = pthread_create(&pollid[i], NULL, zip_wd_poll, &threads_args[i]); + if (ret) { +@@ -1122,6 +1128,7 @@ static int zip_wd_async_threads(struct acc_option *options) + threads_args[i].comp_lv = threads_option.comp_lv; + threads_args[i].win_size = threads_option.win_size; + threads_args[i].td_id = i; ++ g_send_cnt[i] = 0; + ret = pthread_create(&tdid[i], NULL, wd_zip_async_run, &threads_args[i]); + if (ret) { + ZIP_TST_PRT("Create async thread fail!\n"); +-- +2.25.1 + diff --git a/0041-uadk_tool-fix-queue-application-failure-from-multipl.patch b/0041-uadk_tool-fix-queue-application-failure-from-multipl.patch new file mode 100644 index 0000000..d0b59b8 --- /dev/null +++ b/0041-uadk_tool-fix-queue-application-failure-from-multipl.patch @@ -0,0 
+1,641 @@ +From 5210ac8a3f616f381d3990e3ca3f92bf23383f25 Mon Sep 17 00:00:00 2001 +From: Qi Tao +Date: Mon, 11 Mar 2024 16:41:41 +0800 +Subject: [PATCH 41/44] uadk_tool: fix queue application failure from multiple + devices + +Specified device: apply queues from a designated device. +No specified device: apply queues from multiple devices. + +Signed-off-by: Qi Tao +--- + uadk_tool/benchmark/hpre_uadk_benchmark.c | 143 ++++++++++++++++----- + uadk_tool/benchmark/sec_uadk_benchmark.c | 141 +++++++++++++++----- + uadk_tool/benchmark/zip_uadk_benchmark.c | 150 ++++++++++++++++------ + 3 files changed, 329 insertions(+), 105 deletions(-) + +diff --git a/uadk_tool/benchmark/hpre_uadk_benchmark.c b/uadk_tool/benchmark/hpre_uadk_benchmark.c +index 729728f..0148e56 100644 +--- a/uadk_tool/benchmark/hpre_uadk_benchmark.c ++++ b/uadk_tool/benchmark/hpre_uadk_benchmark.c +@@ -344,21 +344,17 @@ static int hpre_uadk_param_parse(thread_data *tddata, struct acc_option *options + return 0; + } + +-static int init_hpre_ctx_config(struct acc_option *options) ++static int specified_device_request_ctx(struct acc_option *options) + { +- struct uacce_dev_list *list, *tmp; +- int subtype = options->subtype; ++ struct uacce_dev_list *list = NULL; ++ struct uacce_dev_list *tmp = NULL; + char *alg = options->algclass; + int mode = options->syncmode; + struct uacce_dev *dev = NULL; +- struct sched_params param; +- int max_node, i; ++ int avail_ctx = 0; + char *dev_name; + int ret = 0; +- +- max_node = numa_max_node() + 1; +- if (max_node <= 0) +- return -EINVAL; ++ int i = 0; + + list = wd_get_accel_list(alg); + if (!list) { +@@ -366,15 +362,11 @@ static int init_hpre_ctx_config(struct acc_option *options) + return -ENODEV; + } + +- if (strlen(options->device) == 0) { +- dev = list->dev; +- } else { +- for (tmp = list; tmp; tmp = tmp->next) { +- dev_name = strrchr(tmp->dev->dev_root, '/') + 1; +- if (!strcmp(dev_name, options->device)) { +- dev = tmp->dev; +- break; +- } ++ for (tmp = list; 
tmp != NULL; tmp = tmp->next) { ++ dev_name = strrchr(tmp->dev->dev_root, '/') + 1; ++ if (!strcmp(dev_name, options->device)) { ++ dev = tmp->dev; ++ break; + } + } + +@@ -384,30 +376,114 @@ static int init_hpre_ctx_config(struct acc_option *options) + goto free_list; + } + +- /* If there is no numa, we defualt config to zero */ +- if (dev->numa_id < 0) +- dev->numa_id = 0; +- +- memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); +- g_ctx_cfg.ctx_num = g_ctxnum; +- g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); +- if (!g_ctx_cfg.ctxs) { +- ret = -ENOMEM; ++ avail_ctx = wd_get_avail_ctx(dev); ++ if (avail_ctx < 0) { ++ HPRE_TST_PRT("failed to get the number of available ctx from %s\n", options->device); ++ ret = avail_ctx; ++ goto free_list; ++ } else if (avail_ctx < g_ctxnum) { ++ HPRE_TST_PRT("error: not enough ctx available in %s\n", options->device); ++ ret = -ENODEV; + goto free_list; + } + ++ /* If there is no numa, we default config to zero */ ++ if (dev->numa_id < 0) ++ dev->numa_id = 0; ++ + for (i = 0; i < g_ctxnum; i++) { + g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); + if (!g_ctx_cfg.ctxs[i].ctx) { + HPRE_TST_PRT("failed to alloc %dth ctx\n", i); +- ret = -ENODEV; ++ ret = -ENOMEM; + goto free_ctx; + } +- + g_ctx_cfg.ctxs[i].op_type = 0; + g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; + } + ++ wd_free_list_accels(list); ++ return 0; ++ ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++ ++free_list: ++ wd_free_list_accels(list); ++ ++ return ret; ++} ++ ++static int non_specified_device_request_ctx(struct acc_option *options) ++{ ++ char *alg = options->algclass; ++ int mode = options->syncmode; ++ struct uacce_dev *dev = NULL; ++ int ret = 0; ++ int i = 0; ++ ++ while (i < g_ctxnum) { ++ dev = wd_get_accel_dev(alg); ++ if (!dev) { ++ HPRE_TST_PRT("failed to get %s device\n", alg); ++ ret = -ENODEV; ++ goto free_ctx; ++ } ++ ++ /* If there is no numa, we default config to zero */ ++ if (dev->numa_id < 0) ++ 
dev->numa_id = 0; ++ ++ for (; i < g_ctxnum; i++) { ++ g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); ++ if (!g_ctx_cfg.ctxs[i].ctx) ++ break; ++ ++ g_ctx_cfg.ctxs[i].op_type = 0; ++ g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; ++ } ++ ++ free(dev); ++ } ++ ++ return 0; ++ ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++ ++ return ret; ++} ++ ++static int init_hpre_ctx_config(struct acc_option *options) ++{ ++ struct sched_params param = {0}; ++ int subtype = options->subtype; ++ int mode = options->syncmode; ++ int max_node; ++ int ret = 0; ++ ++ max_node = numa_max_node() + 1; ++ if (max_node <= 0) ++ return -EINVAL; ++ ++ memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); ++ g_ctx_cfg.ctx_num = g_ctxnum; ++ g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); ++ if (!g_ctx_cfg.ctxs) ++ return -ENOMEM; ++ ++ if (strlen(options->device) != 0) ++ ret = specified_device_request_ctx(options); ++ else ++ ret = non_specified_device_request_ctx(options); ++ ++ if (ret) { ++ HPRE_TST_PRT("failed to request hpre ctx!\n"); ++ goto free_ctxs; ++ } ++ + switch(subtype) { + case RSA_TYPE: + g_sched = wd_sched_rr_alloc(SCHED_POLICY_RR, 1, max_node, wd_rsa_poll_ctx); +@@ -460,7 +536,7 @@ static int init_hpre_ctx_config(struct acc_option *options) + break; + } + if (ret) { +- HPRE_TST_PRT("failed to get hpre ctx!\n"); ++ HPRE_TST_PRT("failed to init hpre ctx!\n"); + goto free_sched; + } + +@@ -470,12 +546,11 @@ free_sched: + wd_sched_rr_release(g_sched); + + free_ctx: +- for (; i >= 0; i--) ++ for (int i = g_ctxnum; i >= 0; i--) + wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); +- free(g_ctx_cfg.ctxs); + +-free_list: +- wd_free_list_accels(list); ++free_ctxs: ++ free(g_ctx_cfg.ctxs); + + return ret; + } +diff --git a/uadk_tool/benchmark/sec_uadk_benchmark.c b/uadk_tool/benchmark/sec_uadk_benchmark.c +index 2c12c20..56f4fa6 100644 +--- a/uadk_tool/benchmark/sec_uadk_benchmark.c ++++ b/uadk_tool/benchmark/sec_uadk_benchmark.c +@@ -544,21 +544,17 @@ static 
int sec_uadk_param_parse(thread_data *tddata, struct acc_option *options) + return 0; + } + +-static int init_ctx_config(struct acc_option *options) ++static int specified_device_request_ctx(struct acc_option *options) + { +- struct uacce_dev_list *list, *tmp; +- struct sched_params param = {0}; +- int subtype = options->subtype; ++ struct uacce_dev_list *list = NULL; ++ struct uacce_dev_list *tmp = NULL; + char *alg = options->algclass; + int mode = options->syncmode; + struct uacce_dev *dev = NULL; +- int max_node, i; ++ int avail_ctx = 0; + char *dev_name; + int ret = 0; +- +- max_node = numa_max_node() + 1; +- if (max_node <= 0) +- return -EINVAL; ++ int i = 0; + + list = wd_get_accel_list(alg); + if (!list) { +@@ -566,15 +562,11 @@ static int init_ctx_config(struct acc_option *options) + return -ENODEV; + } + +- if (strlen(options->device) == 0) { +- dev = list->dev; +- } else { +- for (tmp = list; tmp; tmp = tmp->next) { +- dev_name = strrchr(tmp->dev->dev_root, '/') + 1; +- if (!strcmp(dev_name, options->device)) { +- dev = tmp->dev; +- break; +- } ++ for (tmp = list; tmp != NULL; tmp = tmp->next) { ++ dev_name = strrchr(tmp->dev->dev_root, '/') + 1; ++ if (!strcmp(dev_name, options->device)) { ++ dev = tmp->dev; ++ break; + } + } + +@@ -584,18 +576,21 @@ static int init_ctx_config(struct acc_option *options) + goto free_list; + } + +- /* If there is no numa, we defualt config to zero */ +- if (dev->numa_id < 0) +- dev->numa_id = 0; +- +- memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); +- g_ctx_cfg.ctx_num = g_ctxnum; +- g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); +- if (!g_ctx_cfg.ctxs) { +- ret = -ENOMEM; ++ avail_ctx = wd_get_avail_ctx(dev); ++ if (avail_ctx < 0) { ++ SEC_TST_PRT("failed to get the number of available ctx from %s\n", options->device); ++ ret = avail_ctx; ++ goto free_list; ++ } else if (avail_ctx < g_ctxnum) { ++ SEC_TST_PRT("error: not enough ctx available in %s\n", options->device); ++ ret = -ENODEV; + goto free_list; 
+ } + ++ /* If there is no numa, we default config to zero */ ++ if (dev->numa_id < 0) ++ dev->numa_id = 0; ++ + for (i = 0; i < g_ctxnum; i++) { + g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); + if (!g_ctx_cfg.ctxs[i].ctx) { +@@ -603,11 +598,92 @@ static int init_ctx_config(struct acc_option *options) + ret = -ENOMEM; + goto free_ctx; + } +- + g_ctx_cfg.ctxs[i].op_type = 0; + g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; + } + ++ wd_free_list_accels(list); ++ return 0; ++ ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++ ++free_list: ++ wd_free_list_accels(list); ++ ++ return ret; ++} ++ ++static int non_specified_device_request_ctx(struct acc_option *options) ++{ ++ char *alg = options->algclass; ++ int mode = options->syncmode; ++ struct uacce_dev *dev = NULL; ++ int ret = 0; ++ int i = 0; ++ ++ while (i < g_ctxnum) { ++ dev = wd_get_accel_dev(alg); ++ if (!dev) { ++ SEC_TST_PRT("failed to get %s device\n", alg); ++ ret = -ENODEV; ++ goto free_ctx; ++ } ++ ++ /* If there is no numa, we default config to zero */ ++ if (dev->numa_id < 0) ++ dev->numa_id = 0; ++ ++ for (; i < g_ctxnum; i++) { ++ g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); ++ if (!g_ctx_cfg.ctxs[i].ctx) ++ break; ++ ++ g_ctx_cfg.ctxs[i].op_type = 0; ++ g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; ++ } ++ ++ free(dev); ++ } ++ ++ return 0; ++ ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++ ++ return ret; ++} ++ ++static int init_ctx_config(struct acc_option *options) ++{ ++ struct sched_params param = {0}; ++ int subtype = options->subtype; ++ int mode = options->syncmode; ++ int max_node; ++ int ret = 0; ++ ++ max_node = numa_max_node() + 1; ++ if (max_node <= 0) ++ return -EINVAL; ++ ++ memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); ++ g_ctx_cfg.ctx_num = g_ctxnum; ++ g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); ++ if (!g_ctx_cfg.ctxs) ++ return -ENOMEM; ++ ++ if (strlen(options->device) != 0) ++ ret = 
specified_device_request_ctx(options); ++ else ++ ret = non_specified_device_request_ctx(options); ++ ++ if (ret) { ++ SEC_TST_PRT("failed to request sec ctx!\n"); ++ goto free_ctxs; ++ } ++ + switch(subtype) { + case CIPHER_TYPE: + g_sched = wd_sched_rr_alloc(SCHED_POLICY_RR, 1, max_node, wd_cipher_poll_ctx); +@@ -652,7 +728,7 @@ static int init_ctx_config(struct acc_option *options) + break; + } + if (ret) { +- SEC_TST_PRT("failed to cipher ctx!\n"); ++ SEC_TST_PRT("failed to init sec ctx!\n"); + goto free_sched; + } + +@@ -662,12 +738,11 @@ free_sched: + wd_sched_rr_release(g_sched); + + free_ctx: +- for (; i >= 0; i--) ++ for (int i = g_ctxnum; i >= 0; i--) + wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); +- free(g_ctx_cfg.ctxs); + +-free_list: +- wd_free_list_accels(list); ++free_ctxs: ++ free(g_ctx_cfg.ctxs); + + return ret; + } +diff --git a/uadk_tool/benchmark/zip_uadk_benchmark.c b/uadk_tool/benchmark/zip_uadk_benchmark.c +index 1dd3990..e2876a9 100644 +--- a/uadk_tool/benchmark/zip_uadk_benchmark.c ++++ b/uadk_tool/benchmark/zip_uadk_benchmark.c +@@ -318,21 +318,17 @@ static int init_ctx_config2(struct acc_option *options) + return 0; + } + +-static int init_ctx_config(struct acc_option *options) ++static int specified_device_request_ctx(struct acc_option *options) + { +- struct uacce_dev_list *list, *tmp; ++ struct uacce_dev_list *list = NULL; ++ struct uacce_dev_list *tmp = NULL; + char *alg = options->algclass; +- int optype = options->optype; + int mode = options->syncmode; + struct uacce_dev *dev = NULL; +- int max_node, i; ++ int avail_ctx = 0; + char *dev_name; + int ret = 0; +- +- optype = optype % WD_DIR_MAX; +- max_node = numa_max_node() + 1; +- if (max_node <= 0) +- return -EINVAL; ++ int i = 0; + + list = wd_get_accel_list(alg); + if (!list) { +@@ -340,15 +336,11 @@ static int init_ctx_config(struct acc_option *options) + return -ENODEV; + } + +- if (strlen(options->device) == 0) { +- dev = list->dev; +- } else { +- for (tmp = list; tmp; tmp = 
tmp->next) { +- dev_name = strrchr(tmp->dev->dev_root, '/') + 1; +- if (!strcmp(dev_name, options->device)) { +- dev = tmp->dev; +- break; +- } ++ for (tmp = list; tmp != NULL; tmp = tmp->next) { ++ dev_name = strrchr(tmp->dev->dev_root, '/') + 1; ++ if (!strcmp(dev_name, options->device)) { ++ dev = tmp->dev; ++ break; + } + } + +@@ -358,29 +350,114 @@ static int init_ctx_config(struct acc_option *options) + goto free_list; + } + +- /* If there is no numa, we defualt config to zero */ +- if (dev->numa_id < 0) +- dev->numa_id = 0; +- +- memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); +- g_ctx_cfg.ctx_num = g_ctxnum; +- g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); +- if (!g_ctx_cfg.ctxs) { +- ret = -ENOMEM; ++ avail_ctx = wd_get_avail_ctx(dev); ++ if (avail_ctx < 0) { ++ ZIP_TST_PRT("failed to get the number of available ctx from %s\n", options->device); ++ ret = avail_ctx; ++ goto free_list; ++ } else if (avail_ctx < g_ctxnum) { ++ ZIP_TST_PRT("error: not enough ctx available in %s\n", options->device); ++ ret = -ENODEV; + goto free_list; + } + +- for (i = 0; i < g_ctxnum; i++) { ++ /* If there is no numa, we default config to zero */ ++ if (dev->numa_id < 0) ++ dev->numa_id = 0; ++ ++ for (; i < g_ctxnum; i++) { + g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); + if (!g_ctx_cfg.ctxs[i].ctx) { + ZIP_TST_PRT("failed to alloc %dth ctx\n", i); ++ ret = -ENOMEM; + goto free_ctx; + } +- +- g_ctx_cfg.ctxs[i].op_type = optype; ++ g_ctx_cfg.ctxs[i].op_type = 0; + g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; + } + ++ wd_free_list_accels(list); ++ return 0; ++ ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++ ++free_list: ++ wd_free_list_accels(list); ++ ++ return ret; ++} ++ ++static int non_specified_device_request_ctx(struct acc_option *options) ++{ ++ char *alg = options->algclass; ++ int mode = options->syncmode; ++ struct uacce_dev *dev = NULL; ++ int ret = 0; ++ int i = 0; ++ ++ while (i < g_ctxnum) { ++ dev = 
wd_get_accel_dev(alg); ++ if (!dev) { ++ ZIP_TST_PRT("failed to get %s device\n", alg); ++ ret = -ENODEV; ++ goto free_ctx; ++ } ++ ++ /* If there is no numa, we default config to zero */ ++ if (dev->numa_id < 0) ++ dev->numa_id = 0; ++ ++ for (; i < g_ctxnum; i++) { ++ g_ctx_cfg.ctxs[i].ctx = wd_request_ctx(dev); ++ if (!g_ctx_cfg.ctxs[i].ctx) ++ break; ++ ++ g_ctx_cfg.ctxs[i].op_type = 0; ++ g_ctx_cfg.ctxs[i].ctx_mode = (__u8)mode; ++ } ++ ++ free(dev); ++ } ++ ++ return 0; ++ ++free_ctx: ++ for (; i >= 0; i--) ++ wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); ++ ++ return ret; ++} ++ ++static int init_ctx_config(struct acc_option *options) ++{ ++ int optype = options->optype; ++ int mode = options->syncmode; ++ int max_node; ++ int ret = 0; ++ ++ optype = optype % WD_DIR_MAX; ++ max_node = numa_max_node() + 1; ++ if (max_node <= 0) ++ return -EINVAL; ++ ++ memset(&g_ctx_cfg, 0, sizeof(struct wd_ctx_config)); ++ g_ctx_cfg.ctx_num = g_ctxnum; ++ g_ctx_cfg.ctxs = calloc(g_ctxnum, sizeof(struct wd_ctx)); ++ if (!g_ctx_cfg.ctxs) ++ return -ENOMEM; ++ ++ if (strlen(options->device) != 0) ++ ret = specified_device_request_ctx(options); ++ else ++ ret = non_specified_device_request_ctx(options); ++ ++ if (ret) { ++ ZIP_TST_PRT("failed to request zip ctx!\n"); ++ goto free_ctxs; ++ } ++ + g_sched = wd_sched_rr_alloc(SCHED_POLICY_RR, 2, max_node, wd_comp_poll_ctx); + if (!g_sched) { + ZIP_TST_PRT("failed to alloc sched!\n"); +@@ -394,7 +471,7 @@ static int init_ctx_config(struct acc_option *options) + * All contexts for 2 modes & 2 types. + * The test only uses one kind of contexts at the same time. 
+ */ +- param.numa_id = dev->numa_id; ++ param.numa_id = 0; + param.type = optype; + param.mode = mode; + param.begin = 0; +@@ -407,24 +484,21 @@ static int init_ctx_config(struct acc_option *options) + + ret = wd_comp_init(&g_ctx_cfg, g_sched); + if (ret) { +- ZIP_TST_PRT("failed to cipher ctx!\n"); ++ ZIP_TST_PRT("failed to init zip ctx!\n"); + goto free_sched; + } + +- wd_free_list_accels(list); +- + return 0; + + free_sched: + wd_sched_rr_release(g_sched); + + free_ctx: +- for (; i >= 0; i--) ++ for (int i = g_ctxnum; i >= 0; i--) + wd_release_ctx(g_ctx_cfg.ctxs[i].ctx); +- free(g_ctx_cfg.ctxs); + +-free_list: +- wd_free_list_accels(list); ++free_ctxs: ++ free(g_ctx_cfg.ctxs); + + return ret; + } +-- +2.25.1 + diff --git a/0042-ecc-check-need_debug-before-calling-WD_DEBUG.patch b/0042-ecc-check-need_debug-before-calling-WD_DEBUG.patch new file mode 100644 index 0000000..fff8e56 --- /dev/null +++ b/0042-ecc-check-need_debug-before-calling-WD_DEBUG.patch @@ -0,0 +1,54 @@ +From ba54780c666c7f655cf6b18d0072e1e892656252 Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Wed, 3 Apr 2024 11:24:52 +0800 +Subject: [PATCH 42/44] ecc: check need_debug before calling WD_DEBUG + +Before calling WD_DEBUG, check whether debug logs need to be recorded +to prevent the syslog syscall from affecting the performance. 
+ +Signed-off-by: Weili Qian +Signed-off-by: JiangShui Yang +--- + wd_ecc.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/wd_ecc.c b/wd_ecc.c +index e75bca0..65727e7 100644 +--- a/wd_ecc.c ++++ b/wd_ecc.c +@@ -997,20 +997,19 @@ static int fill_user_curve_cfg(struct wd_ecc_curve *param, + struct wd_ecc_sess_setup *setup) + { + struct wd_ecc_curve *src_param = setup->cv.cfg.pparam; +- __u32 curve_id; ++ bool need_debug = wd_need_debug(); ++ __u32 curve_id = 0; + int ret = 0; + + if (setup->cv.type == WD_CV_CFG_ID) { + curve_id = setup->cv.cfg.id; + ret = fill_param_by_id(param, setup->key_bits, curve_id); +- WD_DEBUG("set curve id %u!\n", curve_id); + } else if (setup->cv.type == WD_CV_CFG_PARAM) { + ret = set_key_cv(param, src_param); + if (ret) { + WD_ERR("failed to set key cv!\n"); + return ret; + } +- WD_DEBUG("set curve by user param!\n"); + } else { + WD_ERR("invalid: fill curve cfg type %u is error!\n", setup->cv.type); + return -WD_EINVAL; +@@ -1022,6 +1021,9 @@ static int fill_user_curve_cfg(struct wd_ecc_curve *param, + return -WD_EINVAL; + } + ++ if (need_debug) ++ WD_DEBUG("curve cfg type is %u, curve_id is %u!\n", setup->cv.type, curve_id); ++ + return ret; + } + +-- +2.25.1 + diff --git a/0043-uadk-remove-unused-ioctl-cmd.patch b/0043-uadk-remove-unused-ioctl-cmd.patch new file mode 100644 index 0000000..a0bfc71 --- /dev/null +++ b/0043-uadk-remove-unused-ioctl-cmd.patch @@ -0,0 +1,28 @@ +From b6aaaaf9eeb1061806c1a00faddd1ce91e5afa33 Mon Sep 17 00:00:00 2001 +From: Weili Qian +Date: Wed, 3 Apr 2024 11:24:53 +0800 +Subject: [PATCH 43/44] uadk: remove unused ioctl cmd + +Remove unused ioctl cmd UACCE_CMD_GET_SS_DMA. 
+ +Signed-off-by: Weili Qian +Signed-off-by: JiangShui Yang +--- + include/uacce.h | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/include/uacce.h b/include/uacce.h +index fb3fb22..bb8d740 100644 +--- a/include/uacce.h ++++ b/include/uacce.h +@@ -15,7 +15,6 @@ extern "C" { + + #define UACCE_CMD_START _IO('W', 0) + #define UACCE_CMD_PUT_Q _IO('W', 1) +-#define UACCE_CMD_GET_SS_DMA _IOR('W', 100, unsigned long) + + /** + * UACCE Device flags: +-- +2.25.1 + diff --git a/0044-uadk-v1-remove-dummy.patch b/0044-uadk-v1-remove-dummy.patch new file mode 100644 index 0000000..ab6968b --- /dev/null +++ b/0044-uadk-v1-remove-dummy.patch @@ -0,0 +1,247 @@ +From deec45b9919adbdf968eae688003b96e69a77011 Mon Sep 17 00:00:00 2001 +From: Wenkai Lin +Date: Wed, 3 Apr 2024 11:24:54 +0800 +Subject: [PATCH 44/44] uadk: v1: remove dummy + +dummy is no longer use, remove it. + +Signed-off-by: Wenkai Lin +Signed-off-by: JiangShui Yang +--- + Makefile.am | 1 - + v1/internal/wd_dummy_usr_if.h | 45 ------------ + v1/test/test_dummy.c | 126 ---------------------------------- + v1/wd_adapter.c | 13 ---- + 4 files changed, 185 deletions(-) + delete mode 100644 v1/internal/wd_dummy_usr_if.h + delete mode 100644 v1/test/test_dummy.c + +diff --git a/Makefile.am b/Makefile.am +index 68f3106..1049639 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -60,7 +60,6 @@ libwd_la_SOURCES=wd.c wd_mempool.c wd.h wd_alg.c wd_alg.h \ + v1/wd_bmm.c v1/wd_bmm.h \ + v1/wd_ecc.c v1/wd_ecc.h \ + v1/wd_sgl.c v1/wd_sgl.h \ +- v1/drv/dummy_drv.c v1/drv/dummy_drv.h \ + v1/drv/hisi_qm_udrv.c v1/drv/hisi_qm_udrv.h \ + v1/drv/hisi_zip_udrv.c v1/drv/hisi_zip_udrv.h \ + v1/drv/hisi_hpre_udrv.c v1/drv/hisi_hpre_udrv.h \ +diff --git a/v1/internal/wd_dummy_usr_if.h b/v1/internal/wd_dummy_usr_if.h +deleted file mode 100644 +index b5673ec..0000000 +--- a/v1/internal/wd_dummy_usr_if.h ++++ /dev/null +@@ -1,45 +0,0 @@ +-/* +- * Copyright 2019 Huawei Technologies Co.,Ltd.All rights reserved. 
+- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-/* +- * This file defines the dummy algo interface between the user +- * and kernel space +- */ +- +-#ifndef __DUMMY_USR_IF_H +-#define __DUMMY_USR_IF_H +- +- +-/* Algorithm name */ +-#define AN_DUMMY_MEMCPY "memcopy" +- +-#define AAN_AFLAGS "aflags" +-#define AAN_MAX_COPY_SIZE "max_copy_size" +- +-struct wd_dummy_cpy_param { +- int flags; +- int max_copy_size; +-}; +- +-struct wd_dummy_cpy_msg { +- char *src_addr; +- char *tgt_addr; +- size_t size; +- void *ptr; +- __u32 ret; +-}; +- +-#endif +diff --git a/v1/test/test_dummy.c b/v1/test/test_dummy.c +deleted file mode 100644 +index 75ab33a..0000000 +--- a/v1/test/test_dummy.c ++++ /dev/null +@@ -1,126 +0,0 @@ +-/* +- * Copyright 2018-2019 Huawei Technologies Co.,Ltd.All rights reserved. +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. 
+- */ +- +-#include "../config.h" +-#include +-#include +-#include +-#include +-#include +- +-#include "wd_sched.h" +-#include "wd_dummy_usr_if.h" +-#include "dummy_hw_usr_if.h" +- +-#define CPSZ 4096 +- +-#define SYS_ERR_COND(cond, msg) if(cond) { \ +- perror(msg); \ +- exit(EXIT_FAILURE); } +- +-struct wd_dummy_cpy_msg *msgs; +- +-int wd_dummy_memcpy(struct wd_queue *q, void *dst, void *src, size_t size) +-{ +- struct wd_dummy_cpy_msg req, *resp; +- int ret; +- +- req.src_addr = src; +- req.tgt_addr = dst; +- req.size = size; +- +- ret = wd_send(q, (void *)&req); +- if (ret) +- return ret; +- +- return wd_recv_sync(q, (void **)&resp, 1000); +-} +- +-static void wd_dummy_sched_init_cache(struct wd_scheduler *sched, int i) +-{ +- sched->msgs[i].msg = &msgs[i]; +- msgs[i].src_addr = sched->msgs[i].data_in; +- msgs[i].tgt_addr = sched->msgs[i].data_out; +- msgs[i].size = sched->msg_data_size; +-} +- +-static int input_num = 10; +-static int wd_dummy_sched_input(struct wd_msg *msg, void *priv) +-{ +- SYS_ERR_COND(input_num <= 0, "input"); +- input_num--; +- memset(msg->data_in, '0'+input_num, CPSZ); +- memset(msg->data_out, 'x', CPSZ); +- +- return 0; +-} +- +-static int wd_dummy_sched_output(struct wd_msg *msg, void *priv) +-{ +- int i; +- char *in, *out; +- +- for (i = 0; i < CPSZ; i++) { +- in = (char *)msg->data_in; +- out = (char *)msg->data_out; +- if(in[i] != out[i]) { +- printf("verify result fail on %d\n", i); +- break; +- } +- +- } +- printf("verify result (%d) success (remained=%d)\n", in[0], input_num); +- +- return 0; +-} +- +-struct wd_scheduler sched = { +- .q_num = 1, +- .ss_region_size = 0, +- .msg_cache_num = 4, +- .msg_data_size = CPSZ, +- .init_cache = wd_dummy_sched_init_cache, +- .input = wd_dummy_sched_input, +- .output = wd_dummy_sched_output, +-}; +- +-int main(int argc, char *argv[]) +-{ +- int ret, i; +- int max_step = 20; +- +- sched.qs = calloc(sched.q_num, sizeof(*sched.qs)); +- SYS_ERR_COND(!sched.qs, "calloc"); +- +- msgs = 
calloc(sched.msg_cache_num, sizeof(*msgs)); +- SYS_ERR_COND(!msgs, "calloc"); +- +- for (i = 0; i < sched.q_num; i++) +- sched.qs[i].capa.alg = "memcpy"; +- +- ret = wd_sched_init(&sched); +- SYS_ERR_COND(ret, "wd_sched_init"); +- +- while(input_num || !wd_sched_empty(&sched)) { +- ret = wd_sched_work(&sched, input_num); +- SYS_ERR_COND(ret < 0, "wd_sched_work"); +- SYS_ERR_COND(max_step-- < 0, "max_step"); +- } +- +- wd_sched_fini(&sched); +- free(sched.qs); +- return EXIT_SUCCESS; +-} +diff --git a/v1/wd_adapter.c b/v1/wd_adapter.c +index d574200..b9b841d 100644 +--- a/v1/wd_adapter.c ++++ b/v1/wd_adapter.c +@@ -20,7 +20,6 @@ + + #include "config.h" + #include "v1/wd_util.h" +-#include "v1/drv/dummy_drv.h" + #include "v1/drv/hisi_qm_udrv.h" + #include "v1/drv/hisi_rng_udrv.h" + #include "v1/wd_adapter.h" +@@ -29,18 +28,6 @@ + #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) + + static const struct wd_drv_dio_if hw_dio_tbl[] = { { +- .hw_type = "dummy_v1", +- .open = dummy_set_queue_dio, +- .close = dummy_unset_queue_dio, +- .send = dummy_add_to_dio_q, +- .recv = dummy_get_from_dio_q, +- }, { +- .hw_type = "dummy_v2", +- .open = dummy_set_queue_dio, +- .close = dummy_unset_queue_dio, +- .send = dummy_add_to_dio_q, +- .recv = dummy_get_from_dio_q, +- }, { + .hw_type = HISI_QM_API_VER_BASE WD_UACCE_API_VER_NOIOMMU_SUBFIX, + .open = qm_init_queue, + .close = qm_uninit_queue, +-- +2.25.1 + diff --git a/0045-cipher-optimze-input-lengths-check.patch b/0045-cipher-optimze-input-lengths-check.patch new file mode 100644 index 0000000..b0cec55 --- /dev/null +++ b/0045-cipher-optimze-input-lengths-check.patch @@ -0,0 +1,114 @@ +From 4e1a4eb28f0e476cf4587d56b5cef4350b33ab82 Mon Sep 17 00:00:00 2001 +From: Wenkai Lin +Date: Fri, 29 Mar 2024 16:53:04 +0800 +Subject: [PATCH 45/52] cipher: optimze input lengths check + +It is more reasonable to check the input lengths of various cipher +algorithms at the algorithm layer. 
+ +Signed-off-by: Wenkai Lin +Signed-off-by: Qi Tao +--- + drv/hisi_sec.c | 19 +++++-------------- + wd_cipher.c | 26 ++++++++++++++++++++++++++ + 2 files changed, 31 insertions(+), 14 deletions(-) + +diff --git a/drv/hisi_sec.c b/drv/hisi_sec.c +index 852340d..6625c41 100644 +--- a/drv/hisi_sec.c ++++ b/drv/hisi_sec.c +@@ -960,10 +960,9 @@ static void parse_cipher_bd2(struct hisi_qp *qp, struct hisi_sec_sqe *sqe, + dump_sec_msg(temp_msg, "cipher"); + } + +-static int aes_sm4_len_check(struct wd_cipher_msg *msg) ++static int aes_len_check(struct wd_cipher_msg *msg) + { +- if (msg->alg == WD_CIPHER_AES && +- msg->in_bytes <= AES_BLOCK_SIZE && ++ if (msg->in_bytes <= AES_BLOCK_SIZE && + (msg->mode == WD_CIPHER_CBC_CS1 || + msg->mode == WD_CIPHER_CBC_CS2 || + msg->mode == WD_CIPHER_CBC_CS3)) { +@@ -972,13 +971,6 @@ static int aes_sm4_len_check(struct wd_cipher_msg *msg) + return -WD_EINVAL; + } + +- if ((msg->in_bytes & (AES_BLOCK_SIZE - 1)) && +- (msg->mode == WD_CIPHER_CBC || msg->mode == WD_CIPHER_ECB)) { +- WD_ERR("failed to check input bytes of AES or SM4, size = %u\n", +- msg->in_bytes); +- return -WD_EINVAL; +- } +- + return 0; + } + +@@ -986,8 +978,7 @@ static int cipher_len_check(struct wd_cipher_msg *msg) + { + int ret; + +- if (msg->in_bytes > MAX_INPUT_DATA_LEN || +- !msg->in_bytes) { ++ if (msg->in_bytes > MAX_INPUT_DATA_LEN) { + WD_ERR("input cipher length is error, size = %u\n", + msg->in_bytes); + return -WD_EINVAL; +@@ -1016,8 +1007,8 @@ static int cipher_len_check(struct wd_cipher_msg *msg) + return 0; + } + +- if (msg->alg == WD_CIPHER_AES || msg->alg == WD_CIPHER_SM4) { +- ret = aes_sm4_len_check(msg); ++ if (msg->alg == WD_CIPHER_AES) { ++ ret = aes_len_check(msg); + if (ret) + return ret; + } +diff --git a/wd_cipher.c b/wd_cipher.c +index f35ce6f..279ca8b 100644 +--- a/wd_cipher.c ++++ b/wd_cipher.c +@@ -565,6 +565,28 @@ static int cipher_iv_len_check(struct wd_cipher_req *req, + return ret; + } + ++static int cipher_len_check(handle_t h_sess, 
struct wd_cipher_req *req) ++{ ++ struct wd_cipher_sess *sess = (struct wd_cipher_sess *)h_sess; ++ ++ if (!req->in_bytes) { ++ WD_ERR("invalid: cipher input length is zero!\n"); ++ return -WD_EINVAL; ++ } ++ ++ if (sess->alg != WD_CIPHER_AES && sess->alg != WD_CIPHER_SM4) ++ return 0; ++ ++ if ((req->in_bytes & (AES_BLOCK_SIZE - 1)) && ++ (sess->mode == WD_CIPHER_CBC || sess->mode == WD_CIPHER_ECB)) { ++ WD_ERR("failed to check input bytes of AES or SM4, size = %u\n", ++ req->in_bytes); ++ return -WD_EINVAL; ++ } ++ ++ return 0; ++} ++ + static int wd_cipher_check_params(handle_t h_sess, + struct wd_cipher_req *req, __u8 mode) + { +@@ -587,6 +609,10 @@ static int wd_cipher_check_params(handle_t h_sess, + return -WD_EINVAL; + } + ++ ret = cipher_len_check(h_sess, req); ++ if (unlikely(ret)) ++ return ret; ++ + ret = wd_check_src_dst(req->src, req->in_bytes, req->dst, req->out_bytes); + if (unlikely(ret)) { + WD_ERR("invalid: src/dst addr is NULL when src/dst size is non-zero!\n"); +-- +2.25.1 + diff --git a/0046-uadk-v1-improve-the-judgment-conditions-of-tag.patch b/0046-uadk-v1-improve-the-judgment-conditions-of-tag.patch new file mode 100644 index 0000000..eacd21e --- /dev/null +++ b/0046-uadk-v1-improve-the-judgment-conditions-of-tag.patch @@ -0,0 +1,54 @@ +From 628139bccaff2499d35cb530f54519f0aa744923 Mon Sep 17 00:00:00 2001 +From: Longfang Liu +Date: Fri, 29 Mar 2024 16:54:41 +0800 +Subject: [PATCH 46/52] uadk/v1: improve the judgment conditions of tag + +Before calling this function, it is guaranteed that the tag is +not empty. + +In addition, some alarm issues in hpre have been modified. 
+ +Signed-off-by: Longfang Liu +Signed-off-by: Qi Tao +--- + v1/drv/hisi_hpre_udrv.c | 4 ++-- + v1/drv/hisi_sec_udrv.c | 3 +-- + 2 files changed, 3 insertions(+), 4 deletions(-) + +diff --git a/v1/drv/hisi_hpre_udrv.c b/v1/drv/hisi_hpre_udrv.c +index de614f2..eaee4b1 100644 +--- a/v1/drv/hisi_hpre_udrv.c ++++ b/v1/drv/hisi_hpre_udrv.c +@@ -212,13 +212,13 @@ static int qm_fill_rsa_pubkey(struct wcrypto_rsa_pubkey *pubkey, void **data) + wd_e->bsize, wd_e->dsize, "rsa pubkey e"); + if (unlikely(ret)) + return ret; +- wd_e->dsize = wd_e->dsize; ++ wd_e->dsize = wd_e->bsize; + + ret = qm_crypto_bin_to_hpre_bin(wd_n->data, (const char *)wd_n->data, + wd_n->bsize, wd_n->dsize, "rsa pubkey n"); + if (unlikely(ret)) + return ret; +- wd_n->dsize = wd_n->dsize; ++ wd_n->dsize = wd_n->bsize; + + *data = wd_e->data; + return (int)(wd_n->bsize + wd_e->bsize); +diff --git a/v1/drv/hisi_sec_udrv.c b/v1/drv/hisi_sec_udrv.c +index d046327..c0bd73d 100644 +--- a/v1/drv/hisi_sec_udrv.c ++++ b/v1/drv/hisi_sec_udrv.c +@@ -759,8 +759,7 @@ static int fill_cipher_bd2(struct wd_queue *q, struct hisi_sec_sqe *sqe, + return ret; + } + +- if (tag) +- sqe->type2.tag = tag->wcrypto_tag.ctx_id; ++ sqe->type2.tag = tag->wcrypto_tag.ctx_id; + + return ret; + } +-- +2.25.1 + diff --git a/0047-uadk-v1-fix-for-sec-cipher-bd1-ci_gen-configuration.patch b/0047-uadk-v1-fix-for-sec-cipher-bd1-ci_gen-configuration.patch new file mode 100644 index 0000000..ab3160a --- /dev/null +++ b/0047-uadk-v1-fix-for-sec-cipher-bd1-ci_gen-configuration.patch @@ -0,0 +1,39 @@ +From f59a72aefeb714c95bddca71431e95746094d6f7 Mon Sep 17 00:00:00 2001 +From: Wenkai Lin +Date: Fri, 29 Mar 2024 16:56:43 +0800 +Subject: [PATCH 47/52] uadk/v1: fix for sec cipher bd1 ci_gen configuration + +In storage scenarios, the XTS mode is used for encrypting and decrypting +data on and off disks. 
According to the definition of this mode, the input +parameter to genarate IV is the LBA, so update SEC bd1 xts mode CI_GEN +from 0 to 3, which means use LBA mode. + +Signed-off-by: Wenkai Lin +Signed-off-by: Qi Tao +--- + v1/drv/hisi_sec_udrv.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/v1/drv/hisi_sec_udrv.c b/v1/drv/hisi_sec_udrv.c +index c0bd73d..d4e090a 100644 +--- a/v1/drv/hisi_sec_udrv.c ++++ b/v1/drv/hisi_sec_udrv.c +@@ -312,11 +312,10 @@ static int fill_cipher_bd1_type(struct wcrypto_cipher_msg *msg, + + fill_bd_addr_type(msg->data_fmt, sqe); + +- /* +- * BD1 cipher only provides ci_gen=0 for compatibility, so user +- * should prepare iv[gran_num] and iv_bytes is sum of all grans +- */ +- sqe->type1.ci_gen = CI_GEN_BY_ADDR; ++ if (msg->mode == WCRYPTO_CIPHER_XTS) ++ sqe->type1.ci_gen = CI_GEN_BY_LBA; ++ else ++ sqe->type1.ci_gen = CI_GEN_BY_ADDR; + + return WD_SUCCESS; + } +-- +2.25.1 + diff --git a/0048-uadk-fix-for-shmget-shmflag.patch b/0048-uadk-fix-for-shmget-shmflag.patch new file mode 100644 index 0000000..bb23b44 --- /dev/null +++ b/0048-uadk-fix-for-shmget-shmflag.patch @@ -0,0 +1,43 @@ +From 0fc17f5c160cb6ea2d1f4b08e9884f29ff75b2dc Mon Sep 17 00:00:00 2001 +From: Wenkai Lin +Date: Fri, 29 Mar 2024 16:58:23 +0800 +Subject: [PATCH 48/52] uadk: fix for shmget shmflag + +The shmflag should be 0600 in octal, not 600 in decimal. 
+ +Signed-off-by: Wenkai Lin +Signed-off-by: Qi Tao +--- + uadk_tool/dfx/uadk_dfx.c | 2 +- + wd_util.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/uadk_tool/dfx/uadk_dfx.c b/uadk_tool/dfx/uadk_dfx.c +index 796135a..9c54b7b 100644 +--- a/uadk_tool/dfx/uadk_dfx.c ++++ b/uadk_tool/dfx/uadk_dfx.c +@@ -16,7 +16,7 @@ + + #define uadk_build_date() printf("built on: %s %s\n", __DATE__, __TIME__) + #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +-#define PRIVILEGE_FLAG 666 ++#define PRIVILEGE_FLAG 0666 + + struct uadk_env_var { + const char *module; +diff --git a/wd_util.c b/wd_util.c +index fb58167..2635dc3 100644 +--- a/wd_util.c ++++ b/wd_util.c +@@ -19,7 +19,7 @@ + #define WD_BALANCE_THRHD 1280 + #define WD_RECV_MAX_CNT_SLEEP 60000000 + #define WD_RECV_MAX_CNT_NOSLEEP 200000000 +-#define PRIVILEGE_FLAG 600 ++#define PRIVILEGE_FLAG 0600 + #define MIN(a, b) ((a) > (b) ? (b) : (a)) + #define MAX(a, b) ((a) > (b) ? (a) : (b)) + +-- +2.25.1 + diff --git a/0049-sec-optimze-for-directly-assigning-values-to-structu.patch b/0049-sec-optimze-for-directly-assigning-values-to-structu.patch new file mode 100644 index 0000000..016f5c4 --- /dev/null +++ b/0049-sec-optimze-for-directly-assigning-values-to-structu.patch @@ -0,0 +1,100 @@ +From 705e33d624defc335cdb1c96335da684868858a3 Mon Sep 17 00:00:00 2001 +From: Wenkai Lin +Date: Fri, 29 Mar 2024 16:59:34 +0800 +Subject: [PATCH 49/52] sec: optimze for directly assigning values to + structures + +It is more reasonable to use pointers for value assignment. 
+ +Signed-off-by: Wenkai Lin +Signed-off-by: Qi Tao +--- + drv/hisi_sec.c | 36 ++++++++++++------------------------ + 1 file changed, 12 insertions(+), 24 deletions(-) + +diff --git a/drv/hisi_sec.c b/drv/hisi_sec.c +index 6625c41..b218cd8 100644 +--- a/drv/hisi_sec.c ++++ b/drv/hisi_sec.c +@@ -542,66 +542,54 @@ static int hisi_sec_aead_recv_v3(struct wd_alg_driver *drv, handle_t ctx, void * + + static int cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *msg) + { +- handle_t h_qp = (handle_t)wd_ctx_get_priv(ctx); +- struct hisi_qp *qp = (struct hisi_qp *)h_qp; +- struct hisi_qm_queue_info q_info = qp->q_info; ++ struct hisi_qp *qp = (struct hisi_qp *)wd_ctx_get_priv(ctx); + +- if (q_info.hw_type == HISI_QM_API_VER2_BASE) ++ if (qp->q_info.hw_type == HISI_QM_API_VER2_BASE) + return hisi_sec_cipher_send(drv, ctx, msg); + return hisi_sec_cipher_send_v3(drv, ctx, msg); + } + + static int cipher_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg) + { +- handle_t h_qp = (handle_t)wd_ctx_get_priv(ctx); +- struct hisi_qp *qp = (struct hisi_qp *)h_qp; +- struct hisi_qm_queue_info q_info = qp->q_info; ++ struct hisi_qp *qp = (struct hisi_qp *)wd_ctx_get_priv(ctx); + +- if (q_info.hw_type == HISI_QM_API_VER2_BASE) ++ if (qp->q_info.hw_type == HISI_QM_API_VER2_BASE) + return hisi_sec_cipher_recv(drv, ctx, msg); + return hisi_sec_cipher_recv_v3(drv, ctx, msg); + } + + static int digest_send(struct wd_alg_driver *drv, handle_t ctx, void *msg) + { +- handle_t h_qp = (handle_t)wd_ctx_get_priv(ctx); +- struct hisi_qp *qp = (struct hisi_qp *)h_qp; +- struct hisi_qm_queue_info q_info = qp->q_info; ++ struct hisi_qp *qp = (struct hisi_qp *)wd_ctx_get_priv(ctx); + +- if (q_info.hw_type == HISI_QM_API_VER2_BASE) ++ if (qp->q_info.hw_type == HISI_QM_API_VER2_BASE) + return hisi_sec_digest_send(drv, ctx, msg); + return hisi_sec_digest_send_v3(drv, ctx, msg); + } + + static int digest_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg) + { +- handle_t h_qp = 
(handle_t)wd_ctx_get_priv(ctx); +- struct hisi_qp *qp = (struct hisi_qp *)h_qp; +- struct hisi_qm_queue_info q_info = qp->q_info; ++ struct hisi_qp *qp = (struct hisi_qp *)wd_ctx_get_priv(ctx); + +- if (q_info.hw_type == HISI_QM_API_VER2_BASE) ++ if (qp->q_info.hw_type == HISI_QM_API_VER2_BASE) + return hisi_sec_digest_recv(drv, ctx, msg); + return hisi_sec_digest_recv_v3(drv, ctx, msg); + } + + static int aead_send(struct wd_alg_driver *drv, handle_t ctx, void *msg) + { +- handle_t h_qp = (handle_t)wd_ctx_get_priv(ctx); +- struct hisi_qp *qp = (struct hisi_qp *)h_qp; +- struct hisi_qm_queue_info q_info = qp->q_info; ++ struct hisi_qp *qp = (struct hisi_qp *)wd_ctx_get_priv(ctx); + +- if (q_info.hw_type == HISI_QM_API_VER2_BASE) ++ if (qp->q_info.hw_type == HISI_QM_API_VER2_BASE) + return hisi_sec_aead_send(drv, ctx, msg); + return hisi_sec_aead_send_v3(drv, ctx, msg); + } + + static int aead_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg) + { +- handle_t h_qp = (handle_t)wd_ctx_get_priv(ctx); +- struct hisi_qp *qp = (struct hisi_qp *)h_qp; +- struct hisi_qm_queue_info q_info = qp->q_info; ++ struct hisi_qp *qp = (struct hisi_qp *)wd_ctx_get_priv(ctx); + +- if (q_info.hw_type == HISI_QM_API_VER2_BASE) ++ if (qp->q_info.hw_type == HISI_QM_API_VER2_BASE) + return hisi_sec_aead_recv(drv, ctx, msg); + return hisi_sec_aead_recv_v3(drv, ctx, msg); + } +-- +2.25.1 + diff --git a/0050-util-optimize-for-wd_handle_msg_sync.patch b/0050-util-optimize-for-wd_handle_msg_sync.patch new file mode 100644 index 0000000..b38f6d2 --- /dev/null +++ b/0050-util-optimize-for-wd_handle_msg_sync.patch @@ -0,0 +1,61 @@ +From f36aa5f7e8f82a90aa0cb729bf00cc51f76970d5 Mon Sep 17 00:00:00 2001 +From: Wenkai Lin +Date: Fri, 29 Mar 2024 17:01:03 +0800 +Subject: [PATCH 50/52] util: optimize for wd_handle_msg_sync + +1. Separate rx_cnt auto-increment and judgment. +2. Reduce the condition judgment in the case of eagain. 
+ +Signed-off-by: Wenkai Lin +Signed-off-by: Qi Tao +--- + wd_util.c | 26 +++++++++++++++----------- + 1 file changed, 15 insertions(+), 11 deletions(-) + +diff --git a/wd_util.c b/wd_util.c +index 2635dc3..0744ff0 100644 +--- a/wd_util.c ++++ b/wd_util.c +@@ -1822,24 +1822,28 @@ int wd_handle_msg_sync(struct wd_alg_driver *drv, struct wd_msg_handle *msg_hand + do { + if (epoll_en) { + ret = wd_ctx_wait(ctx, POLL_TIME); +- if (ret < 0) ++ if (unlikely(ret < 0)) + WD_ERR("wd ctx wait timeout(%d)!\n", ret); + } + + ret = msg_handle->recv(drv, ctx, msg); +- if (ret == -WD_EAGAIN) { +- if (unlikely(rx_cnt++ >= timeout)) { +- WD_ERR("failed to recv msg: timeout!\n"); +- return -WD_ETIMEDOUT; ++ if (ret != -WD_EAGAIN) { ++ if (unlikely(ret < 0)) { ++ WD_ERR("failed to recv msg: error = %d!\n", ret); ++ return ret; + } ++ break; ++ } + +- if (balance && *balance > WD_BALANCE_THRHD) +- usleep(1); +- } else if (unlikely(ret < 0)) { +- WD_ERR("failed to recv msg: error = %d!\n", ret); +- return ret; ++ rx_cnt++; ++ if (unlikely(rx_cnt >= timeout)) { ++ WD_ERR("failed to recv msg: timeout!\n"); ++ return -WD_ETIMEDOUT; + } +- } while (ret < 0); ++ ++ if (balance && *balance > WD_BALANCE_THRHD) ++ usleep(1); ++ } while (1); + + if (balance) + *balance = rx_cnt; +-- +2.25.1 + diff --git a/0051-uadk-drv_hisi-optimize-qm-recv-function.patch b/0051-uadk-drv_hisi-optimize-qm-recv-function.patch new file mode 100644 index 0000000..b070b8e --- /dev/null +++ b/0051-uadk-drv_hisi-optimize-qm-recv-function.patch @@ -0,0 +1,115 @@ +From ace1da03900d04a1e14d61200a89c539ff78856d Mon Sep 17 00:00:00 2001 +From: Wenkai Lin +Date: Fri, 29 Mar 2024 17:02:23 +0800 +Subject: [PATCH 51/52] uadk: drv_hisi - optimize qm recv function + +Ensure that the value written by the hardware is +read from the memory each time, reduce the number +of packet receiving times by half. +Also sqe address is only need calculated when packets +are received. 
+ +Signed-off-by: Wenkai Lin +Signed-off-by: Qi Tao +--- + drv/hisi_qm_udrv.c | 45 +++++++++++++++++++++++---------------------- + 1 file changed, 23 insertions(+), 22 deletions(-) + +diff --git a/drv/hisi_qm_udrv.c b/drv/hisi_qm_udrv.c +index d8b5271..304764e 100644 +--- a/drv/hisi_qm_udrv.c ++++ b/drv/hisi_qm_udrv.c +@@ -21,8 +21,8 @@ + #define QM_DBELL_SQN_MASK 0x3ff + #define QM_DBELL_CMD_MASK 0xf + #define QM_Q_DEPTH 1024 +-#define CQE_PHASE(cq) (__le16_to_cpu((cq)->w7) & 0x1) +-#define CQE_SQ_HEAD_INDEX(cq) (__le16_to_cpu((cq)->sq_head) & 0xffff) ++#define CQE_PHASE(cqe) (__le16_to_cpu((cqe)->w7) & 0x1) ++#define CQE_SQ_HEAD_INDEX(cqe) (__le16_to_cpu((cqe)->sq_head) & 0xffff) + #define VERSION_ID_SHIFT 9 + + #define UACCE_CMD_QM_SET_QP_CTX _IOWR('H', 10, struct hisi_qp_ctx) +@@ -505,32 +505,33 @@ int hisi_qm_send(handle_t h_qp, const void *req, __u16 expect, __u16 *count) + return 0; + } + +-static int hisi_qm_recv_single(struct hisi_qm_queue_info *q_info, void *resp) ++static int hisi_qm_recv_single(struct hisi_qm_queue_info *q_info, handle_t h_ctx, ++ void *resp, __u16 idx) + { +- struct hisi_qp *qp = container_of(q_info, struct hisi_qp, q_info); ++ __u16 i, j, cqe_phase; + struct cqe *cqe; +- __u16 i, j; + + pthread_spin_lock(&q_info->rv_lock); + i = q_info->cq_head_index; + cqe = q_info->cq_base + i * sizeof(struct cqe); ++ cqe_phase = CQE_PHASE(cqe); ++ /* Use dsb to read from memory and improve the receiving efficiency. 
*/ ++ rmb(); + +- if (q_info->cqc_phase == CQE_PHASE(cqe)) { +- /* Make sure cqe valid bit is set */ +- rmb(); +- j = CQE_SQ_HEAD_INDEX(cqe); +- if (unlikely(j >= q_info->sq_depth)) { +- pthread_spin_unlock(&q_info->rv_lock); +- WD_DEV_ERR(qp->h_ctx, "CQE_SQ_HEAD_INDEX(%u) error!\n", j); +- return -WD_EIO; +- } +- memcpy(resp, (void *)((uintptr_t)q_info->sq_base + +- j * q_info->sqe_size), q_info->sqe_size); +- } else { ++ if (q_info->cqc_phase != cqe_phase) { + pthread_spin_unlock(&q_info->rv_lock); + return -WD_EAGAIN; + } + ++ j = CQE_SQ_HEAD_INDEX(cqe); ++ if (unlikely(j >= q_info->sq_depth)) { ++ pthread_spin_unlock(&q_info->rv_lock); ++ WD_DEV_ERR(h_ctx, "CQE_SQ_HEAD_INDEX(%u) error!\n", j); ++ return -WD_EIO; ++ } ++ memcpy((void *)((uintptr_t)resp + idx * q_info->sqe_size), ++ (void *)((uintptr_t)q_info->sq_base + j * q_info->sqe_size), q_info->sqe_size); ++ + if (i == q_info->cq_depth - 1) { + q_info->cqc_phase = !(q_info->cqc_phase); + i = 0; +@@ -544,7 +545,7 @@ static int hisi_qm_recv_single(struct hisi_qm_queue_info *q_info, void *resp) + */ + if (unlikely(wd_ioread32(q_info->ds_rx_base) == 1)) { + pthread_spin_unlock(&q_info->rv_lock); +- WD_DEV_ERR(qp->h_ctx, "wd queue hw error happened after qm receive!\n"); ++ WD_DEV_ERR(h_ctx, "wd queue hw error happened before qm receive!\n"); + return -WD_HW_EACCESS; + } + +@@ -565,8 +566,9 @@ int hisi_qm_recv(handle_t h_qp, void *resp, __u16 expect, __u16 *count) + { + struct hisi_qp *qp = (struct hisi_qp *)h_qp; + struct hisi_qm_queue_info *q_info; +- int recv_num = 0; +- int i, ret, offset; ++ __u16 recv_num = 0; ++ __u16 i; ++ int ret; + + if (unlikely(!resp || !qp || !count)) + return -WD_EINVAL; +@@ -581,8 +583,7 @@ int hisi_qm_recv(handle_t h_qp, void *resp, __u16 expect, __u16 *count) + } + + for (i = 0; i < expect; i++) { +- offset = i * q_info->sqe_size; +- ret = hisi_qm_recv_single(q_info, resp + offset); ++ ret = hisi_qm_recv_single(q_info, qp->h_ctx, resp, i); + if (ret) + break; + recv_num++; +-- 
+2.25.1 + diff --git a/0052-uadk-modify-uadk-static-compile.patch b/0052-uadk-modify-uadk-static-compile.patch new file mode 100644 index 0000000..00b7cf3 --- /dev/null +++ b/0052-uadk-modify-uadk-static-compile.patch @@ -0,0 +1,1307 @@ +From a282605e6550b5572072f9968370fd01502a04f5 Mon Sep 17 00:00:00 2001 +From: Longfang Liu +Date: Fri, 29 Mar 2024 17:04:01 +0800 +Subject: [PATCH 52/52] uadk: modify uadk static compile + +After the UADK framework supports dynamic loading. Device drivers are +all default used in the form of dynamic libraries. + +Static compilation requires static declaration and cannot declare +unknown device drivers. Therefore, static compilation only supports +HiSilicon device drivers. + +Signed-off-by: Longfang Liu +Signed-off-by: Qi Tao +--- + drv/hisi_comp.c | 8 ++++++ + drv/hisi_hpre.c | 9 ++++++ + drv/hisi_sec.c | 8 ++++++ + include/wd_alg.h | 26 +++++++++++++---- + include/wd_alg_common.h | 17 +++++++---- + wd_aead.c | 61 ++++++++++++++++++++++++++------------- + wd_alg.c | 25 +++++++++++++++- + wd_cipher.c | 61 ++++++++++++++++++++++++++------------- + wd_comp.c | 63 ++++++++++++++++++++++++++++------------- + wd_dh.c | 61 ++++++++++++++++++++++++++------------- + wd_digest.c | 63 ++++++++++++++++++++++++++++------------- + wd_ecc.c | 61 ++++++++++++++++++++++++++------------- + wd_rsa.c | 61 ++++++++++++++++++++++++++------------- + 13 files changed, 377 insertions(+), 147 deletions(-) + +diff --git a/drv/hisi_comp.c b/drv/hisi_comp.c +index a1af567..2fa5eff 100644 +--- a/drv/hisi_comp.c ++++ b/drv/hisi_comp.c +@@ -1109,7 +1109,11 @@ static struct wd_alg_driver zip_alg_driver[] = { + GEN_ZIP_ALG_DRIVER("lz77_zstd"), + }; + ++#ifdef WD_STATIC_DRV ++void hisi_zip_probe(void) ++#else + static void __attribute__((constructor)) hisi_zip_probe(void) ++#endif + { + int alg_num = ARRAY_SIZE(zip_alg_driver); + int i, ret; +@@ -1124,7 +1128,11 @@ static void __attribute__((constructor)) hisi_zip_probe(void) + } + } + ++#ifdef WD_STATIC_DRV ++void 
hisi_zip_remove(void) ++#else + static void __attribute__((destructor)) hisi_zip_remove(void) ++#endif + { + int alg_num = ARRAY_SIZE(zip_alg_driver); + int i; +diff --git a/drv/hisi_hpre.c b/drv/hisi_hpre.c +index babc795..68a11ae 100644 +--- a/drv/hisi_hpre.c ++++ b/drv/hisi_hpre.c +@@ -1,3 +1,4 @@ ++ + /* SPDX-License-Identifier: Apache-2.0 */ + /* Copyright 2020-2021 Huawei Technologies Co.,Ltd. All rights reserved. */ + +@@ -2547,7 +2548,11 @@ static struct wd_alg_driver hpre_dh_driver = { + .get_usage = hpre_get_usage, + }; + ++#ifdef WD_STATIC_DRV ++void hisi_hpre_probe(void) ++#else + static void __attribute__((constructor)) hisi_hpre_probe(void) ++#endif + { + __u32 alg_num = ARRAY_SIZE(hpre_ecc_driver); + __u32 i; +@@ -2569,7 +2574,11 @@ static void __attribute__((constructor)) hisi_hpre_probe(void) + } + } + ++#ifdef WD_STATIC_DRV ++void hisi_hpre_remove(void) ++#else + static void __attribute__((destructor)) hisi_hpre_remove(void) ++#endif + { + __u32 alg_num = ARRAY_SIZE(hpre_ecc_driver); + __u32 i; +diff --git a/drv/hisi_sec.c b/drv/hisi_sec.c +index b218cd8..aba4185 100644 +--- a/drv/hisi_sec.c ++++ b/drv/hisi_sec.c +@@ -3087,7 +3087,11 @@ static void hisi_sec_exit(struct wd_alg_driver *drv) + drv->priv = NULL; + } + ++#ifdef WD_STATIC_DRV ++void hisi_sec2_probe(void) ++#else + static void __attribute__((constructor)) hisi_sec2_probe(void) ++#endif + { + int alg_num; + int i, ret; +@@ -3119,7 +3123,11 @@ static void __attribute__((constructor)) hisi_sec2_probe(void) + } + } + ++#ifdef WD_STATIC_DRV ++void hisi_sec2_remove(void) ++#else + static void __attribute__((destructor)) hisi_sec2_remove(void) ++#endif + { + int alg_num; + int i; +diff --git a/include/wd_alg.h b/include/wd_alg.h +index 861b7d9..1735896 100644 +--- a/include/wd_alg.h ++++ b/include/wd_alg.h +@@ -69,7 +69,7 @@ enum alg_dev_type { + UADK_ALG_HW = 0x3 + }; + +-/** ++/* + * @drv_name: name of the current device driver + * @alg_name: name of the algorithm supported by the driver + * 
@priority: priority of the type of algorithm supported by the driver +@@ -133,7 +133,7 @@ inline int wd_alg_driver_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg + return drv->recv(drv, ctx, msg); + } + +-/** ++/* + * wd_alg_driver_register() - Register a device driver. + * @wd_alg_driver: a device driver that supports an algorithm. + * +@@ -142,7 +142,7 @@ inline int wd_alg_driver_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg + int wd_alg_driver_register(struct wd_alg_driver *drv); + void wd_alg_driver_unregister(struct wd_alg_driver *drv); + +-/** ++/* + * @alg_name: name of the algorithm supported by the driver + * @drv_name: name of the current device driver + * @available: Indicates whether the current driver still has resources available +@@ -165,7 +165,7 @@ struct wd_alg_list { + struct wd_alg_list *next; + }; + +-/** ++/* + * wd_request_drv() - Apply for an algorithm driver. + * @alg_name: task algorithm name. + * @hw_mask: the flag of shield hardware device drivers. +@@ -175,7 +175,7 @@ struct wd_alg_list { + struct wd_alg_driver *wd_request_drv(const char *alg_name, bool hw_mask); + void wd_release_drv(struct wd_alg_driver *drv); + +-/** ++/* + * wd_drv_alg_support() - Check the algorithms supported by the driver. + * @alg_name: task algorithm name. + * @drv: a device driver that supports an algorithm. +@@ -185,7 +185,7 @@ void wd_release_drv(struct wd_alg_driver *drv); + bool wd_drv_alg_support(const char *alg_name, + struct wd_alg_driver *drv); + +-/** ++/* + * wd_enable_drv() - Re-enable use of the current device driver. + * @drv: a device driver that supports an algorithm. 
+ */ +@@ -194,6 +194,20 @@ void wd_disable_drv(struct wd_alg_driver *drv); + + struct wd_alg_list *wd_get_alg_head(void); + ++#ifdef WD_STATIC_DRV ++/* ++ * duplicate drivers will be skipped when it register to alg_list ++ */ ++void hisi_sec2_probe(void); ++void hisi_hpre_probe(void); ++void hisi_zip_probe(void); ++ ++void hisi_sec2_remove(void); ++void hisi_hpre_remove(void); ++void hisi_zip_remove(void); ++ ++#endif ++ + #ifdef __cplusplus + } + #endif +diff --git a/include/wd_alg_common.h b/include/wd_alg_common.h +index 32b8630..1235f1d 100644 +--- a/include/wd_alg_common.h ++++ b/include/wd_alg_common.h +@@ -55,7 +55,12 @@ enum wd_ctx_mode { + CTX_MODE_MAX, + }; + +-/** ++enum wd_init_type { ++ WD_TYPE_V1, ++ WD_TYPE_V2, ++}; ++ ++/* + * struct wd_ctx - Define one ctx and related type. + * @ctx: The ctx itself. + * @op_type: Define the operation type of this specific ctx. +@@ -69,7 +74,7 @@ struct wd_ctx { + __u8 ctx_mode; + }; + +-/** ++/* + * struct wd_cap_config - Capabilities. + * @ctx_msg_num: number of asynchronous msg pools that the user wants to allocate. + * Optional, user can set ctx_msg_num based on the number of requests +@@ -82,7 +87,7 @@ struct wd_cap_config { + __u32 resv; + }; + +-/** ++/* + * struct wd_ctx_config - Define a ctx set and its related attributes, which + * will be used in the scope of current process. + * @ctx_num: The ctx number in below ctx array. +@@ -98,7 +103,7 @@ struct wd_ctx_config { + struct wd_cap_config *cap; + }; + +-/** ++/* + * struct wd_ctx_nums - Define the ctx sets numbers. + * @sync_ctx_num: The ctx numbers which are used for sync mode for each + * ctx sets. +@@ -110,7 +115,7 @@ struct wd_ctx_nums { + __u32 async_ctx_num; + }; + +-/** ++/* + * struct wd_ctx_params - Define the ctx sets params which are used for init + * algorithms. 
+ * @op_type_num: Used for index of ctx_set_num, the order is the same as +@@ -144,7 +149,7 @@ struct wd_ctx_config_internal { + unsigned long *msg_cnt; + }; + +-/** ++/* + * struct wd_comp_sched - Define a scheduler. + * @name: Name of this scheduler. + * @sched_policy: Method for scheduler to perform scheduling +diff --git a/wd_aead.c b/wd_aead.c +index 57daa80..daed761 100644 +--- a/wd_aead.c ++++ b/wd_aead.c +@@ -62,22 +62,48 @@ struct wd_aead_sess { + struct wd_env_config wd_aead_env_config; + static struct wd_init_attrs wd_aead_init_attrs; + +-static void wd_aead_close_driver(void) ++static void wd_aead_close_driver(int init_type) + { ++#ifndef WD_STATIC_DRV ++ if (init_type == WD_TYPE_V2) { ++ wd_dlclose_drv(wd_aead_setting.dlh_list); ++ return; ++ } ++ + if (wd_aead_setting.dlhandle) { + wd_release_drv(wd_aead_setting.driver); + dlclose(wd_aead_setting.dlhandle); + wd_aead_setting.dlhandle = NULL; + } ++#else ++ wd_release_drv(wd_aead_setting.driver); ++ hisi_sec2_remove(); ++#endif + } + +-static int wd_aead_open_driver(void) ++static int wd_aead_open_driver(int init_type) + { + struct wd_alg_driver *driver = NULL; + const char *alg_name = "gcm(aes)"; ++#ifndef WD_STATIC_DRV + char lib_path[PATH_MAX]; + int ret; + ++ if (init_type == WD_TYPE_V2) { ++ /* ++ * Driver lib file path could set by env param. 
++ * then open tham by wd_dlopen_drv() ++ * use NULL means dynamic query path ++ */ ++ wd_aead_setting.dlh_list = wd_dlopen_drv(NULL); ++ if (!wd_aead_setting.dlh_list) { ++ WD_ERR("fail to open driver lib files.\n"); ++ return -WD_EINVAL; ++ } ++ ++ return WD_SUCCESS; ++ } ++ + ret = wd_get_lib_file_path("libhisi_sec.so", lib_path, false); + if (ret) + return ret; +@@ -87,17 +113,21 @@ static int wd_aead_open_driver(void) + WD_ERR("failed to open libhisi_sec.so, %s\n", dlerror()); + return -WD_EINVAL; + } +- ++#else ++ hisi_sec2_probe(); ++ if (init_type == WD_TYPE_V2) ++ return WD_SUCCESS; ++#endif + driver = wd_request_drv(alg_name, false); + if (!driver) { +- wd_aead_close_driver(); ++ wd_aead_close_driver(WD_TYPE_V1); + WD_ERR("failed to get %s driver support\n", alg_name); + return -WD_EINVAL; + } + + wd_aead_setting.driver = driver; + +- return 0; ++ return WD_SUCCESS; + } + + static int aes_key_len_check(__u32 length) +@@ -466,7 +496,7 @@ int wd_aead_init(struct wd_ctx_config *config, struct wd_sched *sched) + if (ret) + goto out_clear_init; + +- ret = wd_aead_open_driver(); ++ ret = wd_aead_open_driver(WD_TYPE_V1); + if (ret) + goto out_clear_init; + +@@ -479,7 +509,7 @@ int wd_aead_init(struct wd_ctx_config *config, struct wd_sched *sched) + return 0; + + out_close_driver: +- wd_aead_close_driver(); ++ wd_aead_close_driver(WD_TYPE_V1); + out_clear_init: + wd_alg_clear_init(&wd_aead_setting.status); + return ret; +@@ -509,7 +539,7 @@ void wd_aead_uninit(void) + if (ret) + return; + +- wd_aead_close_driver(); ++ wd_aead_close_driver(WD_TYPE_V1); + wd_alg_clear_init(&wd_aead_setting.status); + } + +@@ -551,16 +581,9 @@ int wd_aead_init2_(char *alg, __u32 sched_type, int task_type, + goto out_uninit; + } + +- /* +- * Driver lib file path could set by env param. 
+- * then open them by wd_dlopen_drv() +- * use NULL means dynamic query path +- */ +- wd_aead_setting.dlh_list = wd_dlopen_drv(NULL); +- if (!wd_aead_setting.dlh_list) { +- WD_ERR("failed to open driver lib files.\n"); ++ state = wd_aead_open_driver(WD_TYPE_V2); ++ if (state) + goto out_uninit; +- } + + while (ret != 0) { + memset(&wd_aead_setting.config, 0, sizeof(struct wd_ctx_config_internal)); +@@ -613,7 +636,7 @@ out_params_uninit: + out_driver: + wd_alg_drv_unbind(wd_aead_setting.driver); + out_dlopen: +- wd_dlclose_drv(wd_aead_setting.dlh_list); ++ wd_aead_close_driver(WD_TYPE_V2); + out_uninit: + wd_alg_clear_init(&wd_aead_setting.status); + return ret; +@@ -629,7 +652,7 @@ void wd_aead_uninit2(void) + + wd_alg_attrs_uninit(&wd_aead_init_attrs); + wd_alg_drv_unbind(wd_aead_setting.driver); +- wd_dlclose_drv(wd_aead_setting.dlh_list); ++ wd_aead_close_driver(WD_TYPE_V2); + wd_aead_setting.dlh_list = NULL; + wd_alg_clear_init(&wd_aead_setting.status); + } +diff --git a/wd_alg.c b/wd_alg.c +index f34a407..0a15fe8 100644 +--- a/wd_alg.c ++++ b/wd_alg.c +@@ -150,6 +150,26 @@ static bool wd_alg_driver_match(struct wd_alg_driver *drv, + return true; + } + ++static bool wd_alg_repeat_check(struct wd_alg_driver *drv) ++{ ++ struct wd_alg_list *npre = &alg_list_head; ++ struct wd_alg_list *pnext = NULL; ++ ++ pthread_mutex_lock(&mutex); ++ pnext = npre->next; ++ while (pnext) { ++ if (wd_alg_driver_match(drv, pnext)) { ++ pthread_mutex_unlock(&mutex); ++ return true; ++ } ++ npre = pnext; ++ pnext = pnext->next; ++ } ++ pthread_mutex_unlock(&mutex); ++ ++ return false; ++} ++ + int wd_alg_driver_register(struct wd_alg_driver *drv) + { + struct wd_alg_list *new_alg; +@@ -164,6 +184,9 @@ int wd_alg_driver_register(struct wd_alg_driver *drv) + return -WD_EINVAL; + } + ++ if (wd_alg_repeat_check(drv)) ++ return 0; ++ + new_alg = calloc(1, sizeof(struct wd_alg_list)); + if (!new_alg) { + WD_ERR("failed to alloc alg driver memory!\n"); +@@ -238,7 +261,7 @@ bool 
wd_drv_alg_support(const char *alg_name, + struct wd_alg_list *head = &alg_list_head; + struct wd_alg_list *pnext = head->next; + +- if (!alg_name) ++ if (!alg_name || !drv) + return false; + + while (pnext) { +diff --git a/wd_cipher.c b/wd_cipher.c +index 279ca8b..9b6e884 100644 +--- a/wd_cipher.c ++++ b/wd_cipher.c +@@ -72,22 +72,48 @@ struct wd_cipher_sess { + struct wd_env_config wd_cipher_env_config; + static struct wd_init_attrs wd_cipher_init_attrs; + +-static void wd_cipher_close_driver(void) ++static void wd_cipher_close_driver(int init_type) + { ++#ifndef WD_STATIC_DRV ++ if (init_type == WD_TYPE_V2) { ++ wd_dlclose_drv(wd_cipher_setting.dlh_list); ++ return; ++ } ++ + if (wd_cipher_setting.dlhandle) { + wd_release_drv(wd_cipher_setting.driver); + dlclose(wd_cipher_setting.dlhandle); + wd_cipher_setting.dlhandle = NULL; + } ++#else ++ wd_release_drv(wd_cipher_setting.driver); ++ hisi_sec2_remove(); ++#endif + } + +-static int wd_cipher_open_driver(void) ++static int wd_cipher_open_driver(int init_type) + { + struct wd_alg_driver *driver = NULL; + const char *alg_name = "cbc(aes)"; ++#ifndef WD_STATIC_DRV + char lib_path[PATH_MAX]; + int ret; + ++ if (init_type == WD_TYPE_V2) { ++ /* ++ * Driver lib file path could set by env param. 
++ * then open tham by wd_dlopen_drv() ++ * use NULL means dynamic query path ++ */ ++ wd_cipher_setting.dlh_list = wd_dlopen_drv(NULL); ++ if (!wd_cipher_setting.dlh_list) { ++ WD_ERR("fail to open driver lib files.\n"); ++ return -WD_EINVAL; ++ } ++ ++ return WD_SUCCESS; ++ } ++ + ret = wd_get_lib_file_path("libhisi_sec.so", lib_path, false); + if (ret) + return ret; +@@ -97,17 +123,21 @@ static int wd_cipher_open_driver(void) + WD_ERR("failed to open libhisi_sec.so, %s\n", dlerror()); + return -WD_EINVAL; + } +- ++#else ++ hisi_sec2_probe(); ++ if (init_type == WD_TYPE_V2) ++ return WD_SUCCESS; ++#endif + driver = wd_request_drv(alg_name, false); + if (!driver) { +- wd_cipher_close_driver(); ++ wd_cipher_close_driver(WD_TYPE_V1); + WD_ERR("failed to get %s driver support\n", alg_name); + return -WD_EINVAL; + } + + wd_cipher_setting.driver = driver; + +- return 0; ++ return WD_SUCCESS; + } + + static bool is_des_weak_key(const __u8 *key) +@@ -365,7 +395,7 @@ int wd_cipher_init(struct wd_ctx_config *config, struct wd_sched *sched) + if (ret) + goto out_clear_init; + +- ret = wd_cipher_open_driver(); ++ ret = wd_cipher_open_driver(WD_TYPE_V1); + if (ret) + goto out_clear_init; + +@@ -378,7 +408,7 @@ int wd_cipher_init(struct wd_ctx_config *config, struct wd_sched *sched) + return 0; + + out_close_driver: +- wd_cipher_close_driver(); ++ wd_cipher_close_driver(WD_TYPE_V1); + out_clear_init: + wd_alg_clear_init(&wd_cipher_setting.status); + return ret; +@@ -392,7 +422,7 @@ void wd_cipher_uninit(void) + if (ret) + return; + +- wd_cipher_close_driver(); ++ wd_cipher_close_driver(WD_TYPE_V1); + wd_alg_clear_init(&wd_cipher_setting.status); + } + +@@ -421,16 +451,9 @@ int wd_cipher_init2_(char *alg, __u32 sched_type, int task_type, struct wd_ctx_p + goto out_uninit; + } + +- /* +- * Driver lib file path could set by env param. 
+- * then open tham by wd_dlopen_drv() +- * use NULL means dynamic query path +- */ +- wd_cipher_setting.dlh_list = wd_dlopen_drv(NULL); +- if (!wd_cipher_setting.dlh_list) { +- WD_ERR("fail to open driver lib files.\n"); ++ state = wd_cipher_open_driver(WD_TYPE_V2); ++ if (state) + goto out_uninit; +- } + + while (ret != 0) { + memset(&wd_cipher_setting.config, 0, sizeof(struct wd_ctx_config_internal)); +@@ -484,7 +507,7 @@ out_params_uninit: + out_driver: + wd_alg_drv_unbind(wd_cipher_setting.driver); + out_dlopen: +- wd_dlclose_drv(wd_cipher_setting.dlh_list); ++ wd_cipher_close_driver(WD_TYPE_V2); + out_uninit: + wd_alg_clear_init(&wd_cipher_setting.status); + return ret; +@@ -500,7 +523,7 @@ void wd_cipher_uninit2(void) + + wd_alg_attrs_uninit(&wd_cipher_init_attrs); + wd_alg_drv_unbind(wd_cipher_setting.driver); +- wd_dlclose_drv(wd_cipher_setting.dlh_list); ++ wd_cipher_close_driver(WD_TYPE_V2); + wd_cipher_setting.dlh_list = NULL; + wd_alg_clear_init(&wd_cipher_setting.status); + } +diff --git a/wd_comp.c b/wd_comp.c +index cabd17f..459223e 100644 +--- a/wd_comp.c ++++ b/wd_comp.c +@@ -54,22 +54,48 @@ struct wd_comp_setting { + struct wd_env_config wd_comp_env_config; + static struct wd_init_attrs wd_comp_init_attrs; + +-static void wd_comp_close_driver(void) ++static void wd_comp_close_driver(int init_type) + { ++#ifndef WD_STATIC_DRV ++ if (init_type == WD_TYPE_V2) { ++ wd_dlclose_drv(wd_comp_setting.dlh_list); ++ return; ++ } ++ + if (wd_comp_setting.dlhandle) { + wd_release_drv(wd_comp_setting.driver); + dlclose(wd_comp_setting.dlhandle); + wd_comp_setting.dlhandle = NULL; + } ++#else ++ wd_release_drv(wd_comp_setting.driver); ++ hisi_zip_remove(); ++#endif + } + +-static int wd_comp_open_driver(void) ++static int wd_comp_open_driver(int init_type) + { + struct wd_alg_driver *driver = NULL; +- char lib_path[PATH_MAX]; + const char *alg_name = "zlib"; ++#ifndef WD_STATIC_DRV ++ char lib_path[PATH_MAX]; + int ret; + ++ if (init_type == WD_TYPE_V2) { ++ /* 
++ * Driver lib file path could set by env param. ++ * then open them by wd_dlopen_drv() ++ * use NULL means dynamic query path ++ */ ++ wd_comp_setting.dlh_list = wd_dlopen_drv(NULL); ++ if (!wd_comp_setting.dlh_list) { ++ WD_ERR("fail to open driver lib files.\n"); ++ return -WD_EINVAL; ++ } ++ ++ return WD_SUCCESS; ++ } ++ + ret = wd_get_lib_file_path("libhisi_zip.so", lib_path, false); + if (ret) + return ret; +@@ -79,17 +105,21 @@ static int wd_comp_open_driver(void) + WD_ERR("failed to open libhisi_zip.so, %s\n", dlerror()); + return -WD_EINVAL; + } +- ++#else ++ hisi_zip_probe(); ++ if (init_type == WD_TYPE_V2) ++ return WD_SUCCESS; ++#endif + driver = wd_request_drv(alg_name, false); + if (!driver) { +- wd_comp_close_driver(); ++ wd_comp_close_driver(WD_TYPE_V1); + WD_ERR("failed to get %s driver support\n", alg_name); + return -WD_EINVAL; + } + + wd_comp_setting.driver = driver; + +- return 0; ++ return WD_SUCCESS; + } + + static void wd_comp_clear_status(void) +@@ -185,7 +215,7 @@ int wd_comp_init(struct wd_ctx_config *config, struct wd_sched *sched) + if (ret) + goto out_clear_init; + +- ret = wd_comp_open_driver(); ++ ret = wd_comp_open_driver(WD_TYPE_V1); + if (ret) + goto out_clear_init; + +@@ -198,7 +228,7 @@ int wd_comp_init(struct wd_ctx_config *config, struct wd_sched *sched) + return 0; + + out_clear_driver: +- wd_comp_close_driver(); ++ wd_comp_close_driver(WD_TYPE_V1); + out_clear_init: + wd_alg_clear_init(&wd_comp_setting.status); + return ret; +@@ -212,7 +242,7 @@ void wd_comp_uninit(void) + if (ret) + return; + +- wd_comp_close_driver(); ++ wd_comp_close_driver(WD_TYPE_V1); + wd_alg_clear_init(&wd_comp_setting.status); + } + +@@ -241,16 +271,9 @@ int wd_comp_init2_(char *alg, __u32 sched_type, int task_type, struct wd_ctx_par + goto out_uninit; + } + +- /* +- * Driver lib file path could set by env param. 
+- * then open tham by wd_dlopen_drv() +- * use NULL means dynamic query path +- */ +- wd_comp_setting.dlh_list = wd_dlopen_drv(NULL); +- if (!wd_comp_setting.dlh_list) { +- WD_ERR("fail to open driver lib files.\n"); ++ state = wd_comp_open_driver(WD_TYPE_V2); ++ if (state) + goto out_uninit; +- } + + while (ret != 0) { + memset(&wd_comp_setting.config, 0, sizeof(struct wd_ctx_config_internal)); +@@ -303,7 +326,7 @@ out_params_uninit: + out_unbind_drv: + wd_alg_drv_unbind(wd_comp_setting.driver); + out_dlclose: +- wd_dlclose_drv(wd_comp_setting.dlh_list); ++ wd_comp_close_driver(WD_TYPE_V2); + out_uninit: + wd_alg_clear_init(&wd_comp_setting.status); + return ret; +@@ -319,7 +342,7 @@ void wd_comp_uninit2(void) + + wd_alg_attrs_uninit(&wd_comp_init_attrs); + wd_alg_drv_unbind(wd_comp_setting.driver); +- wd_dlclose_drv(wd_comp_setting.dlh_list); ++ wd_comp_close_driver(WD_TYPE_V2); + wd_comp_setting.dlh_list = NULL; + wd_alg_clear_init(&wd_comp_setting.status); + } +diff --git a/wd_dh.c b/wd_dh.c +index 4d08de6..36b0cd7 100644 +--- a/wd_dh.c ++++ b/wd_dh.c +@@ -41,23 +41,49 @@ static struct wd_dh_setting { + struct wd_env_config wd_dh_env_config; + static struct wd_init_attrs wd_dh_init_attrs; + +-static void wd_dh_close_driver(void) ++static void wd_dh_close_driver(int init_type) + { ++#ifndef WD_STATIC_DRV ++ if (init_type == WD_TYPE_V2) { ++ wd_dlclose_drv(wd_dh_setting.dlh_list); ++ return; ++ } ++ + if (!wd_dh_setting.dlhandle) + return; + + wd_release_drv(wd_dh_setting.driver); + dlclose(wd_dh_setting.dlhandle); + wd_dh_setting.dlhandle = NULL; ++#else ++ wd_release_drv(wd_dh_setting.driver); ++ hisi_hpre_remove(); ++#endif + } + +-static int wd_dh_open_driver(void) ++static int wd_dh_open_driver(int init_type) + { + struct wd_alg_driver *driver = NULL; +- char lib_path[PATH_MAX]; + const char *alg_name = "dh"; ++#ifndef WD_STATIC_DRV ++ char lib_path[PATH_MAX]; + int ret; + ++ if (init_type == WD_TYPE_V2) { ++ /* ++ * Driver lib file path could set by env 
param. ++ * then open them by wd_dlopen_drv() ++ * default dir in the /root/lib/xxx.so and then dlopen ++ */ ++ wd_dh_setting.dlh_list = wd_dlopen_drv(NULL); ++ if (!wd_dh_setting.dlh_list) { ++ WD_ERR("failed to open driver lib files.\n"); ++ return -WD_EINVAL; ++ } ++ ++ return WD_SUCCESS; ++ } ++ + ret = wd_get_lib_file_path("libhisi_hpre.so", lib_path, false); + if (ret) + return ret; +@@ -67,10 +93,14 @@ static int wd_dh_open_driver(void) + WD_ERR("failed to open libhisi_hpre.so, %s!\n", dlerror()); + return -WD_EINVAL; + } +- ++#else ++ hisi_hpre_probe(); ++ if (init_type == WD_TYPE_V2) ++ return WD_SUCCESS; ++#endif + driver = wd_request_drv(alg_name, false); + if (!driver) { +- wd_dh_close_driver(); ++ wd_dh_close_driver(WD_TYPE_V1); + WD_ERR("failed to get %s driver support\n", alg_name); + return -WD_EINVAL; + } +@@ -158,7 +188,7 @@ int wd_dh_init(struct wd_ctx_config *config, struct wd_sched *sched) + if (ret) + goto out_clear_init; + +- ret = wd_dh_open_driver(); ++ ret = wd_dh_open_driver(WD_TYPE_V1); + if (ret) + goto out_clear_init; + +@@ -171,7 +201,7 @@ int wd_dh_init(struct wd_ctx_config *config, struct wd_sched *sched) + return WD_SUCCESS; + + out_close_driver: +- wd_dh_close_driver(); ++ wd_dh_close_driver(WD_TYPE_V1); + out_clear_init: + wd_alg_clear_init(&wd_dh_setting.status); + return ret; +@@ -185,7 +215,7 @@ void wd_dh_uninit(void) + if (ret) + return; + +- wd_dh_close_driver(); ++ wd_dh_close_driver(WD_TYPE_V1); + wd_alg_clear_init(&wd_dh_setting.status); + } + +@@ -212,16 +242,9 @@ int wd_dh_init2_(char *alg, __u32 sched_type, int task_type, struct wd_ctx_param + goto out_clear_init; + } + +- /* +- * Driver lib file path could set by env param. 
+- * than open tham by wd_dlopen_drv() +- * default dir in the /root/lib/xxx.so and then dlopen +- */ +- wd_dh_setting.dlh_list = wd_dlopen_drv(NULL); +- if (!wd_dh_setting.dlh_list) { +- WD_ERR("failed to open driver lib files!\n"); ++ state = wd_dh_open_driver(WD_TYPE_V2); ++ if (state) + goto out_clear_init; +- } + + while (ret) { + memset(&wd_dh_setting.config, 0, sizeof(struct wd_ctx_config_internal)); +@@ -275,7 +298,7 @@ out_params_uninit: + out_driver: + wd_alg_drv_unbind(wd_dh_setting.driver); + out_dlopen: +- wd_dlclose_drv(wd_dh_setting.dlh_list); ++ wd_dh_close_driver(WD_TYPE_V2); + out_clear_init: + wd_alg_clear_init(&wd_dh_setting.status); + return ret; +@@ -291,7 +314,7 @@ void wd_dh_uninit2(void) + + wd_alg_attrs_uninit(&wd_dh_init_attrs); + wd_alg_drv_unbind(wd_dh_setting.driver); +- wd_dlclose_drv(wd_dh_setting.dlh_list); ++ wd_dh_close_driver(WD_TYPE_V2); + wd_dh_setting.dlh_list = NULL; + wd_alg_clear_init(&wd_dh_setting.status); + } +diff --git a/wd_digest.c b/wd_digest.c +index 0df7204..7449259 100644 +--- a/wd_digest.c ++++ b/wd_digest.c +@@ -73,22 +73,48 @@ struct wd_digest_sess { + struct wd_env_config wd_digest_env_config; + static struct wd_init_attrs wd_digest_init_attrs; + +-static void wd_digest_close_driver(void) ++static void wd_digest_close_driver(int init_type) + { ++#ifndef WD_STATIC_DRV ++ if (init_type == WD_TYPE_V2) { ++ wd_dlclose_drv(wd_digest_setting.dlh_list); ++ return; ++ } ++ + if (wd_digest_setting.dlhandle) { + wd_release_drv(wd_digest_setting.driver); + dlclose(wd_digest_setting.dlhandle); + wd_digest_setting.dlhandle = NULL; + } ++#else ++ wd_release_drv(wd_digest_setting.driver); ++ hisi_sec2_remove(); ++#endif + } + +-static int wd_digest_open_driver(void) ++static int wd_digest_open_driver(int init_type) + { + struct wd_alg_driver *driver = NULL; + const char *alg_name = "sm3"; ++#ifndef WD_STATIC_DRV + char lib_path[PATH_MAX]; + int ret; + ++ if (init_type == WD_TYPE_V2) { ++ /* ++ * Driver lib file path could 
set by env param. ++ * then open tham by wd_dlopen_drv() ++ * use NULL means dynamic query path ++ */ ++ wd_digest_setting.dlh_list = wd_dlopen_drv(NULL); ++ if (!wd_digest_setting.dlh_list) { ++ WD_ERR("fail to open driver lib files.\n"); ++ return -WD_EINVAL; ++ } ++ ++ return WD_SUCCESS; ++ } ++ + ret = wd_get_lib_file_path("libhisi_sec.so", lib_path, false); + if (ret) + return ret; +@@ -98,17 +124,21 @@ static int wd_digest_open_driver(void) + WD_ERR("failed to open libhisi_sec.so, %s\n", dlerror()); + return -WD_EINVAL; + } +- ++#else ++ hisi_sec2_probe(); ++ if (init_type == WD_TYPE_V2) ++ return WD_SUCCESS; ++#endif + driver = wd_request_drv(alg_name, false); + if (!driver) { +- wd_digest_close_driver(); ++ wd_digest_close_driver(WD_TYPE_V1); + WD_ERR("failed to get %s driver support\n", alg_name); + return -WD_EINVAL; + } + + wd_digest_setting.driver = driver; + +- return 0; ++ return WD_SUCCESS; + } + + static int aes_key_len_check(__u32 length) +@@ -277,7 +307,7 @@ int wd_digest_init(struct wd_ctx_config *config, struct wd_sched *sched) + if (ret) + goto out_clear_init; + +- ret = wd_digest_open_driver(); ++ ret = wd_digest_open_driver(WD_TYPE_V1); + if (ret) + goto out_clear_init; + +@@ -290,7 +320,7 @@ int wd_digest_init(struct wd_ctx_config *config, struct wd_sched *sched) + return 0; + + out_close_driver: +- wd_digest_close_driver(); ++ wd_digest_close_driver(WD_TYPE_V1); + out_clear_init: + wd_alg_clear_init(&wd_digest_setting.status); + return ret; +@@ -319,7 +349,7 @@ void wd_digest_uninit(void) + if (ret) + return; + +- wd_digest_close_driver(); ++ wd_digest_close_driver(WD_TYPE_V1); + wd_alg_clear_init(&wd_digest_setting.status); + } + +@@ -356,16 +386,11 @@ int wd_digest_init2_(char *alg, __u32 sched_type, int task_type, + WD_ERR("invalid: digest:%s unsupported!\n", alg); + goto out_uninit; + } +- /* +- * Driver lib file path could set by env param. 
+- * then open them by wd_dlopen_drv() +- * use NULL means dynamic query path +- */ +- wd_digest_setting.dlh_list = wd_dlopen_drv(NULL); +- if (!wd_digest_setting.dlh_list) { +- WD_ERR("failed to open driver lib files.\n"); ++ ++ state = wd_digest_open_driver(WD_TYPE_V2); ++ if (state) + goto out_uninit; +- } ++ + + while (ret != 0) { + memset(&wd_digest_setting.config, 0, sizeof(struct wd_ctx_config_internal)); +@@ -417,7 +442,7 @@ out_params_uninit: + out_driver: + wd_alg_drv_unbind(wd_digest_setting.driver); + out_dlopen: +- wd_dlclose_drv(wd_digest_setting.dlh_list); ++ wd_digest_close_driver(WD_TYPE_V2); + out_uninit: + wd_alg_clear_init(&wd_digest_setting.status); + return ret; +@@ -433,7 +458,7 @@ void wd_digest_uninit2(void) + + wd_alg_attrs_uninit(&wd_digest_init_attrs); + wd_alg_drv_unbind(wd_digest_setting.driver); +- wd_dlclose_drv(wd_digest_setting.dlh_list); ++ wd_digest_close_driver(WD_TYPE_V2); + wd_digest_setting.dlh_list = NULL; + wd_alg_clear_init(&wd_digest_setting.status); + } +diff --git a/wd_ecc.c b/wd_ecc.c +index e75bca0..24f167f 100644 +--- a/wd_ecc.c ++++ b/wd_ecc.c +@@ -95,23 +95,49 @@ static const struct curve_param_desc curve_pram_list[] = { + { ECC_CURVE_G, offsetof(struct wd_ecc_prikey, g), offsetof(struct wd_ecc_pubkey, g) } + }; + +-static void wd_ecc_close_driver(void) ++static void wd_ecc_close_driver(int init_type) + { ++#ifndef WD_STATIC_DRV ++ if (init_type == WD_TYPE_V2) { ++ wd_dlclose_drv(wd_ecc_setting.dlh_list); ++ return; ++ } ++ + if (!wd_ecc_setting.dlhandle) + return; + + wd_release_drv(wd_ecc_setting.driver); + dlclose(wd_ecc_setting.dlhandle); + wd_ecc_setting.dlhandle = NULL; ++#else ++ wd_release_drv(wd_ecc_setting.driver); ++ hisi_hpre_remove(); ++#endif + } + +-static int wd_ecc_open_driver(void) ++static int wd_ecc_open_driver(int init_type) + { + struct wd_alg_driver *driver = NULL; +- char lib_path[PATH_MAX]; + const char *alg_name = "sm2"; ++#ifndef WD_STATIC_DRV ++ char lib_path[PATH_MAX]; + int ret; + ++ 
if (init_type == WD_TYPE_V2) { ++ /* ++ * Driver lib file path could set by env param. ++ * then open them by wd_dlopen_drv() ++ * default dir in the /root/lib/xxx.so and then dlopen ++ */ ++ wd_ecc_setting.dlh_list = wd_dlopen_drv(NULL); ++ if (!wd_ecc_setting.dlh_list) { ++ WD_ERR("failed to open driver lib files.\n"); ++ return -WD_EINVAL; ++ } ++ ++ return WD_SUCCESS; ++ } ++ + ret = wd_get_lib_file_path("libhisi_hpre.so", lib_path, false); + if (ret) + return ret; +@@ -121,10 +147,14 @@ static int wd_ecc_open_driver(void) + WD_ERR("failed to open libhisi_hpre.so, %s!\n", dlerror()); + return -WD_EINVAL; + } +- ++#else ++ hisi_hpre_probe(); ++ if (init_type == WD_TYPE_V2) ++ return WD_SUCCESS; ++#endif + driver = wd_request_drv(alg_name, false); + if (!driver) { +- wd_ecc_close_driver(); ++ wd_ecc_close_driver(WD_TYPE_V1); + WD_ERR("failed to get %s driver support\n", alg_name); + return -WD_EINVAL; + } +@@ -221,7 +251,7 @@ int wd_ecc_init(struct wd_ctx_config *config, struct wd_sched *sched) + if (ret) + goto out_clear_init; + +- ret = wd_ecc_open_driver(); ++ ret = wd_ecc_open_driver(WD_TYPE_V1); + if (ret) + goto out_clear_init; + +@@ -234,7 +264,7 @@ int wd_ecc_init(struct wd_ctx_config *config, struct wd_sched *sched) + return WD_SUCCESS; + + out_close_driver: +- wd_ecc_close_driver(); ++ wd_ecc_close_driver(WD_TYPE_V1); + out_clear_init: + wd_alg_clear_init(&wd_ecc_setting.status); + return ret; +@@ -248,7 +278,7 @@ void wd_ecc_uninit(void) + if (ret) + return; + +- wd_ecc_close_driver(); ++ wd_ecc_close_driver(WD_TYPE_V1); + wd_alg_clear_init(&wd_ecc_setting.status); + } + +@@ -277,16 +307,9 @@ int wd_ecc_init2_(char *alg, __u32 sched_type, int task_type, struct wd_ctx_para + goto out_clear_init; + } + +- /* +- * Driver lib file path could set by env param. 
+- * than open tham by wd_dlopen_drv() +- * default dir in the /root/lib/xxx.so and then dlopen +- */ +- wd_ecc_setting.dlh_list = wd_dlopen_drv(NULL); +- if (!wd_ecc_setting.dlh_list) { +- WD_ERR("failed to open driver lib files!\n"); ++ state = wd_ecc_open_driver(WD_TYPE_V2); ++ if (state) + goto out_clear_init; +- } + + while (ret) { + memset(&wd_ecc_setting.config, 0, sizeof(struct wd_ctx_config_internal)); +@@ -340,7 +363,7 @@ out_params_uninit: + out_driver: + wd_alg_drv_unbind(wd_ecc_setting.driver); + out_dlopen: +- wd_dlclose_drv(wd_ecc_setting.dlh_list); ++ wd_ecc_close_driver(WD_TYPE_V2); + out_clear_init: + wd_alg_clear_init(&wd_ecc_setting.status); + return ret; +@@ -356,7 +379,7 @@ void wd_ecc_uninit2(void) + + wd_alg_attrs_uninit(&wd_ecc_init_attrs); + wd_alg_drv_unbind(wd_ecc_setting.driver); +- wd_dlclose_drv(wd_ecc_setting.dlh_list); ++ wd_ecc_close_driver(WD_TYPE_V2); + wd_ecc_setting.dlh_list = NULL; + wd_alg_clear_init(&wd_ecc_setting.status); + } +diff --git a/wd_rsa.c b/wd_rsa.c +index 8e51177..f7f815c 100644 +--- a/wd_rsa.c ++++ b/wd_rsa.c +@@ -82,23 +82,49 @@ static struct wd_rsa_setting { + struct wd_env_config wd_rsa_env_config; + static struct wd_init_attrs wd_rsa_init_attrs; + +-static void wd_rsa_close_driver(void) ++static void wd_rsa_close_driver(int init_type) + { ++#ifndef WD_STATIC_DRV ++ if (init_type == WD_TYPE_V2) { ++ wd_dlclose_drv(wd_rsa_setting.dlh_list); ++ return; ++ } ++ + if (!wd_rsa_setting.dlhandle) + return; + + wd_release_drv(wd_rsa_setting.driver); + dlclose(wd_rsa_setting.dlhandle); + wd_rsa_setting.dlhandle = NULL; ++#else ++ wd_release_drv(wd_rsa_setting.driver); ++ hisi_hpre_remove(); ++#endif + } + +-static int wd_rsa_open_driver(void) ++static int wd_rsa_open_driver(int init_type) + { + struct wd_alg_driver *driver = NULL; +- char lib_path[PATH_MAX]; + const char *alg_name = "rsa"; ++#ifndef WD_STATIC_DRV ++ char lib_path[PATH_MAX]; + int ret; + ++ if (init_type == WD_TYPE_V2) { ++ /* ++ * Driver lib file 
path could set by env param. ++ * then open them by wd_dlopen_drv() ++ * default dir in the /root/lib/xxx.so and then dlopen ++ */ ++ wd_rsa_setting.dlh_list = wd_dlopen_drv(NULL); ++ if (!wd_rsa_setting.dlh_list) { ++ WD_ERR("failed to open driver lib files.\n"); ++ return -WD_EINVAL; ++ } ++ ++ return WD_SUCCESS; ++ } ++ + ret = wd_get_lib_file_path("libhisi_hpre.so", lib_path, false); + if (ret) + return ret; +@@ -108,10 +134,14 @@ static int wd_rsa_open_driver(void) + WD_ERR("failed to open libhisi_hpre.so, %s!\n", dlerror()); + return -WD_EINVAL; + } +- ++#else ++ hisi_hpre_probe(); ++ if (init_type == WD_TYPE_V2) ++ return WD_SUCCESS; ++#endif + driver = wd_request_drv(alg_name, false); + if (!driver) { +- wd_rsa_close_driver(); ++ wd_rsa_close_driver(WD_TYPE_V1); + WD_ERR("failed to get %s driver support!\n", alg_name); + return -WD_EINVAL; + } +@@ -198,7 +228,7 @@ int wd_rsa_init(struct wd_ctx_config *config, struct wd_sched *sched) + if (ret) + goto out_clear_init; + +- ret = wd_rsa_open_driver(); ++ ret = wd_rsa_open_driver(WD_TYPE_V1); + if (ret) + goto out_clear_init; + +@@ -211,7 +241,7 @@ int wd_rsa_init(struct wd_ctx_config *config, struct wd_sched *sched) + return WD_SUCCESS; + + out_close_driver: +- wd_rsa_close_driver(); ++ wd_rsa_close_driver(WD_TYPE_V1); + out_clear_init: + wd_alg_clear_init(&wd_rsa_setting.status); + return ret; +@@ -225,7 +255,7 @@ void wd_rsa_uninit(void) + if (ret) + return; + +- wd_rsa_close_driver(); ++ wd_rsa_close_driver(WD_TYPE_V1); + wd_alg_clear_init(&wd_rsa_setting.status); + } + +@@ -252,16 +282,9 @@ int wd_rsa_init2_(char *alg, __u32 sched_type, int task_type, struct wd_ctx_para + goto out_clear_init; + } + +- /* +- * Driver lib file path could set by env param. 
+- * than open tham by wd_dlopen_drv() +- * default dir in the /root/lib/xxx.so and then dlopen +- */ +- wd_rsa_setting.dlh_list = wd_dlopen_drv(NULL); +- if (!wd_rsa_setting.dlh_list) { +- WD_ERR("failed to open driver lib files!\n"); ++ state = wd_rsa_open_driver(WD_TYPE_V2); ++ if (state) + goto out_clear_init; +- } + + while (ret) { + memset(&wd_rsa_setting.config, 0, sizeof(struct wd_ctx_config_internal)); +@@ -315,7 +338,7 @@ out_params_uninit: + out_driver: + wd_alg_drv_unbind(wd_rsa_setting.driver); + out_dlopen: +- wd_dlclose_drv(wd_rsa_setting.dlh_list); ++ wd_rsa_close_driver(WD_TYPE_V2); + out_clear_init: + wd_alg_clear_init(&wd_rsa_setting.status); + return ret; +@@ -331,7 +354,7 @@ void wd_rsa_uninit2(void) + + wd_alg_attrs_uninit(&wd_rsa_init_attrs); + wd_alg_drv_unbind(wd_rsa_setting.driver); +- wd_dlclose_drv(wd_rsa_setting.dlh_list); ++ wd_rsa_close_driver(WD_TYPE_V2); + wd_rsa_setting.dlh_list = NULL; + wd_alg_clear_init(&wd_rsa_setting.status); + } +-- +2.25.1 + diff --git a/warpdrive.spec b/libwd.spec similarity index 67% rename from warpdrive.spec rename to libwd.spec index 347eada..5a06c72 100644 --- a/warpdrive.spec +++ b/libwd.spec @@ -2,19 +2,10 @@ Name: libwd Summary: User Space Accelerator Development Kit Version: 2.6.0 -Release: 2 +Release: 3 License: Apache-2.0 Source: %{name}-%{version}.tar.gz -Patch01: 0001-uadk-fix-build-issue-of-pthread_atfork.patch -Patch02: 0002-uadk-fix-static-build-error.patch -Patch03: 0003-uadk-add-secure-compilation-option.patch -Patch04: 0004-uadk_tool-fix-build-error.patch -Patch05: 0005-v1-fix-build-error.patch -Patch06: 0006-wd_mempool-fix-build-error.patch -Patch07: 0007-wd_rsa-fix-build-error.patch -Patch08: 0008-test-fix-build-error.patch - Vendor: Huawei Corporation ExclusiveOS: linux URL: https://support.huawei.com @@ -26,6 +17,59 @@ BuildRequires: automake, autoconf, libtool, chrpath BuildRequires: gcc, make ExclusiveArch: aarch64 +Patch01: 0001-uadk-fix-build-issue-of-pthread_atfork.patch 
+Patch02: 0002-uadk-fix-static-build-error.patch +Patch03: 0003-uadk-add-secure-compilation-option.patch +Patch04: 0004-uadk_tool-fix-build-error.patch +Patch05: 0005-v1-fix-build-error.patch +Patch06: 0006-wd_mempool-fix-build-error.patch +Patch07: 0007-wd_rsa-fix-build-error.patch +Patch08: 0008-test-fix-build-error.patch +Patch0009: 0009-uadk-sec-move-function-to-wd_digest_drv.h.patch +Patch0010: 0010-uadk-digest-add-partial_block-to-store-partial-data.patch +Patch0011: 0011-uadk-digest-add-wd_ctx_spin_lock-function.patch +Patch0012: 0012-uadk-remove-redundant-header-file-in-makefile.patch +Patch0013: 0013-uadk-isa-ce-support-sm3-ce-instruction.patch +Patch0014: 0014-uadk-fix-control-range-of-environmemt-variable.patch +Patch0015: 0015-uadk-util-use-default-sched_type-for-instruction-tas.patch +Patch0016: 0016-uadk-digest-modify-spelling-errors.patch +Patch0017: 0017-uadk-drv-hisi-fix-failed-to-init-drv-after-fork.patch +Patch0018: 0018-wd_rsa-fix-wd_rsa_common_uninit-re-entry.patch +Patch0019: 0019-wd_dh-Fix-wd_aead_uninit-re-entry.patch +Patch0020: 0020-wd_ecc-Fix-wd_ecc_uninit-re-entry.patch +Patch0021: 0021-wd_digest-uninit-check-status-in-one-func.patch +Patch0022: 0022-wd_aead-uninit-check-status-in-one-func.patch +Patch0023: 0023-makefile-install-wd_zlibwrapper.h-to-system.patch +Patch0024: 0024-conf-fix-includedir.patch +Patch0025: 0025-cipher-add-support-for-SM4-CBC-and-CTR-modes-in-CE-i.patch +Patch0026: 0026-cipher-add-support-for-SM4-CFB-and-XTS-modes-in-CE-i.patch +Patch0027: 0027-cipher-add-support-for-SM4-ECB-algorithm-in-CE-instr.patch +Patch0028: 0028-uadk-cipher-isa_ce-support-SM4-cbc_cts-mode.patch +Patch0029: 0029-uadk-wd_alg-check-whether-the-platform-supports-SVE.patch +Patch0030: 0030-uadk-sched-fix-async-mode-ctx-id.patch +Patch0031: 0031-uadk-initializes-ctx-resources-in-SVE-mode.patch +Patch0032: 0032-uadk-hash_mb-support-multi-buffer-calculation-for-sm.patch +Patch0033: 0033-uadk_tool-fix-aead-performance-test-issue.patch +Patch0034: 
0034-uadk_tool-fix-the-logic-for-counting-retransmissions.patch +Patch0035: 0035-uadk-tools-support-the-nosva-test-of-a-specified-dev.patch +Patch0036: 0036-uadk-tools-support-designated-device-testing.patch +Patch0037: 0037-uadk_tool-support-sm3-ce-benchmark-and-function-test.patch +Patch0038: 0038-uadk_tool-support-sm4-ce-benchmark-test.patch +Patch0039: 0039-uadk_tool-support-sm3-md5-multibuff-benchmark-test.patch +Patch0040: 0040-uadk-tool-fix-the-msg-pool-release-bug-of-async-zip-.patch +Patch0041: 0041-uadk_tool-fix-queue-application-failure-from-multipl.patch +Patch0042: 0042-ecc-check-need_debug-before-calling-WD_DEBUG.patch +Patch0043: 0043-uadk-remove-unused-ioctl-cmd.patch +Patch0044: 0044-uadk-v1-remove-dummy.patch +Patch0045: 0045-cipher-optimze-input-lengths-check.patch +Patch0046: 0046-uadk-v1-improve-the-judgment-conditions-of-tag.patch +Patch0047: 0047-uadk-v1-fix-for-sec-cipher-bd1-ci_gen-configuration.patch +Patch0048: 0048-uadk-fix-for-shmget-shmflag.patch +Patch0049: 0049-sec-optimze-for-directly-assigning-values-to-structu.patch +Patch0050: 0050-util-optimize-for-wd_handle_msg_sync.patch +Patch0051: 0051-uadk-drv_hisi-optimize-qm-recv-function.patch +Patch0052: 0052-uadk-modify-uadk-static-compile.patch + %description This package contains the User Space Accelerator Library for hardware accelerator, compress, symmetric encryption @@ -102,6 +146,9 @@ rm -rf ${RPM_BUILD_ROOT} /sbin/ldconfig %changelog +* Sun Apr 7 2024 JiangShui Yang 2.6.0-3 +- libwd: update the source code + * Thu Feb 22 2024 JiangShui Yang 2.6.0-2 - libwd: simplify warpdrive.spec