From 5dbc53c96ac4efcf26b4dbcdbbf55d1b5e7a06be Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Sat, 23 Mar 2024 18:00:43 +0800
Subject: [PATCH 32/44] uadk/hash_mb: support multi-buffer calculation for sm3
 and md5

Supports sm3 and md5 multi-buffer calculation by using SVE instructions.
If the platform supports SVE instructions, users can choose SVE instructions
to perform sm3 and md5 algorithm calculation.

The assembly implementation is from isa-l_crypto:
https://github.com/intel/isa-l_crypto.git

Signed-off-by: Weili Qian <qianweili@huawei.com>
---
 Makefile.am                   |  15 +-
 drv/hash_mb/hash_mb.c         | 843 ++++++++++++++++++++++++++++++++++
 drv/hash_mb/hash_mb.h         |  62 +++
 drv/hash_mb/md5_mb_asimd_x1.S | 248 ++++++++++
 drv/hash_mb/md5_mb_asimd_x4.S | 526 +++++++++++++++++++++
 drv/hash_mb/md5_mb_sve.S      | 158 +++++++
 drv/hash_mb/md5_sve_common.S  | 478 +++++++++++++++++++
 drv/hash_mb/sm3_mb_asimd_x1.S | 387 ++++++++++++++++
 drv/hash_mb/sm3_mb_asimd_x4.S | 576 +++++++++++++++++++++++
 drv/hash_mb/sm3_mb_sve.S      | 161 +++++++
 drv/hash_mb/sm3_sve_common.S  | 505 ++++++++++++++++++++
 11 files changed, 3958 insertions(+), 1 deletion(-)
 create mode 100644 drv/hash_mb/hash_mb.c
 create mode 100644 drv/hash_mb/hash_mb.h
 create mode 100644 drv/hash_mb/md5_mb_asimd_x1.S
 create mode 100644 drv/hash_mb/md5_mb_asimd_x4.S
 create mode 100644 drv/hash_mb/md5_mb_sve.S
 create mode 100644 drv/hash_mb/md5_sve_common.S
 create mode 100644 drv/hash_mb/sm3_mb_asimd_x1.S
 create mode 100644 drv/hash_mb/sm3_mb_asimd_x4.S
 create mode 100644 drv/hash_mb/sm3_mb_sve.S
 create mode 100644 drv/hash_mb/sm3_sve_common.S

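Note: the SVE path is only usable when the CPU and kernel actually expose SVE,
as the commit message says. The snippet below is a minimal, illustrative check
an application could run before preferring the SVE-based hash_mb driver; it is
not part of this patch and assumes an aarch64 Linux target where the HWCAP_SVE
auxv bit is defined (hash_mb.c below includes <sys/auxv.h>, which provides
getauxval() for this kind of runtime query).

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_SVE
#define HWCAP_SVE	(1UL << 22)	/* aarch64 AT_HWCAP bit for SVE */
#endif

int main(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);

	/* Prefer the SVE multi-buffer kernels only when the bit is set. */
	if (hwcap & HWCAP_SVE)
		printf("SVE available: hash_mb SVE kernels can be used\n");
	else
		printf("no SVE: use the ASIMD x4/x1 kernels instead\n");

	return 0;
}
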
diff --git a/Makefile.am b/Makefile.am
index f78ad14..68f3106 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -45,7 +45,7 @@ lib_LTLIBRARIES=libwd.la libwd_comp.la libwd_crypto.la

uadk_driversdir=$(libdir)/uadk
uadk_drivers_LTLIBRARIES=libhisi_sec.la libhisi_hpre.la libhisi_zip.la \
- libisa_ce.la
+ libisa_ce.la libisa_sve.la

libwd_la_SOURCES=wd.c wd_mempool.c wd.h wd_alg.c wd_alg.h \
v1/wd.c v1/wd.h v1/wd_adapter.c v1/wd_adapter.h \
@@ -94,6 +94,12 @@ libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \
libisa_ce_la_SOURCES=arm_arch_ce.h drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S isa_ce_sm3.h \
drv/isa_ce_sm4.c drv/isa_ce_sm4_armv8.S drv/isa_ce_sm4.h

+libisa_sve_la_SOURCES=drv/hash_mb/hash_mb.c wd_digest_drv.h drv/hash_mb/hash_mb.h \
+ drv/hash_mb/sm3_sve_common.S drv/hash_mb/sm3_mb_asimd_x1.S \
+ drv/hash_mb/sm3_mb_asimd_x4.S drv/hash_mb/sm3_mb_sve.S \
+ drv/hash_mb/md5_sve_common.S drv/hash_mb/md5_mb_asimd_x1.S \
+ drv/hash_mb/md5_mb_asimd_x4.S drv/hash_mb/md5_mb_sve.S
+
if WD_STATIC_DRV
AM_CFLAGS += -DWD_STATIC_DRV -fPIC
AM_CFLAGS += -DWD_NO_LOG
@@ -117,6 +123,9 @@ libhisi_hpre_la_DEPENDENCIES = libwd.la libwd_crypto.la
libisa_ce_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS)
libisa_ce_la_DEPENDENCIES = libwd.la libwd_crypto.la

+libisa_sve_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS)
+libisa_sve_la_DEPENDENCIES = libwd.la libwd_crypto.la
+
else
UADK_WD_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd.map
UADK_CRYPTO_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd_crypto.map
@@ -149,6 +158,10 @@ libhisi_hpre_la_DEPENDENCIES= libwd.la libwd_crypto.la
libisa_ce_la_LIBADD= -lwd -lwd_crypto
libisa_ce_la_LDFLAGS=$(UADK_VERSION)
libisa_ce_la_DEPENDENCIES= libwd.la libwd_crypto.la
+
+libisa_sve_la_LIBADD= -lwd -lwd_crypto
+libisa_sve_la_LDFLAGS=$(UADK_VERSION)
+libisa_sve_la_DEPENDENCIES= libwd.la libwd_crypto.la
endif # WD_STATIC_DRV

pkgconfigdir = $(libdir)/pkgconfig
diff --git a/drv/hash_mb/hash_mb.c b/drv/hash_mb/hash_mb.c
|
|
new file mode 100644
|
|
index 0000000..a73c698
|
|
--- /dev/null
|
|
+++ b/drv/hash_mb/hash_mb.c
|
|
@@ -0,0 +1,843 @@
|
|
+/* SPDX-License-Identifier: Apache-2.0 */
|
|
+/* Copyright 2024 Huawei Technologies Co.,Ltd. All rights reserved. */
|
|
+
|
|
+#include <sys/auxv.h>
|
|
+#include <pthread.h>
|
|
+#include <stdlib.h>
|
|
+#include <string.h>
|
|
+#include "hash_mb.h"
|
|
+
|
|
+#define MIN(a, b) (((a) > (b)) ? (b) : (a))
|
|
+#define IPAD_VALUE 0x36
|
|
+#define OPAD_VALUE 0x5C
|
|
+#define HASH_KEY_LEN 64
|
|
+#define HASH_BLOCK_OFFSET 6
|
|
+#define HASH_BLOCK_SIZE 64
|
|
+#define HASH_PADLENGTHFIELD_SIZE 56
|
|
+#define HASH_PADDING_SIZE 120
|
|
+#define HASH_HIGH_32BITS 32
|
|
+#define HASH_PADDING_BLOCKS 2
|
|
+#define HASH_NENO_PROCESS_JOBS 4
|
|
+#define HASH_TRY_PROCESS_COUNT 16
|
|
+#define BYTES_TO_BITS_OFFSET 3
|
|
+
|
|
+#define MD5_DIGEST_DATA_SIZE 16
|
|
+#define SM3_DIGEST_DATA_SIZE 32
|
|
+#define HASH_MAX_LANES 32
|
|
+#define SM3_MAX_LANES 16
|
|
+
|
|
+#define PUTU32(p, V) \
|
|
+ ((p)[0] = (uint8_t)((V) >> 24), \
|
|
+ (p)[1] = (uint8_t)((V) >> 16), \
|
|
+ (p)[2] = (uint8_t)((V) >> 8), \
|
|
+ (p)[3] = (uint8_t)(V))
|
|
+
|
|
+struct hash_mb_ops {
|
|
+ int (*max_lanes)(void);
|
|
+ void (*asimd_x4)(struct hash_job *job1, struct hash_job *job2,
|
|
+ struct hash_job *job3, struct hash_job *job4, int len);
|
|
+ void (*asimd_x1)(struct hash_job *job, int len);
|
|
+ void (*sve)(int blocks, int total_lanes, struct hash_job **job_vec);
|
|
+ __u8 *iv_data;
|
|
+ int iv_bytes;
|
|
+ int max_jobs;
|
|
+};
|
|
+
|
|
+struct hash_mb_poll_queue {
|
|
+ struct hash_job *head;
|
|
+ struct hash_job *tail;
|
|
+ pthread_spinlock_t s_lock;
|
|
+ const struct hash_mb_ops *ops;
|
|
+ __u32 job_num;
|
|
+};
|
|
+
|
|
+struct hash_mb_queue {
|
|
+ struct hash_mb_poll_queue sm3_poll_queue;
|
|
+ struct hash_mb_poll_queue md5_poll_queue;
|
|
+ pthread_spinlock_t r_lock;
|
|
+ struct hash_job *recv_head;
|
|
+ struct hash_job *recv_tail;
|
|
+ __u32 complete_cnt;
|
|
+ __u8 ctx_mode;
|
|
+};
|
|
+
|
|
+struct hash_mb_ctx {
|
|
+ struct wd_ctx_config_internal config;
|
|
+};
|
|
+
|
|
+static __u8 sm3_iv_data[SM3_DIGEST_DATA_SIZE] = {
|
|
+ 0x73, 0x80, 0x16, 0x6f, 0x49, 0x14, 0xb2, 0xb9,
|
|
+ 0x17, 0x24, 0x42, 0xd7, 0xda, 0x8a, 0x06, 0x00,
|
|
+ 0xa9, 0x6f, 0x30, 0xbc, 0x16, 0x31, 0x38, 0xaa,
|
|
+ 0xe3, 0x8d, 0xee, 0x4d, 0xb0, 0xfb, 0x0e, 0x4e,
|
|
+};
|
|
+
|
|
+static __u8 md5_iv_data[MD5_DIGEST_DATA_SIZE] = {
|
|
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
|
|
+ 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
|
|
+};
|
|
+
|
|
+static struct hash_mb_ops md5_ops = {
|
|
+ .max_lanes = md5_mb_sve_max_lanes,
|
|
+ .asimd_x4 = md5_mb_asimd_x4,
|
|
+ .asimd_x1 = md5_mb_asimd_x1,
|
|
+ .sve = md5_mb_sve,
|
|
+ .iv_data = md5_iv_data,
|
|
+ .iv_bytes = MD5_DIGEST_DATA_SIZE,
|
|
+ .max_jobs = HASH_MAX_LANES,
|
|
+};
|
|
+
|
|
+static struct hash_mb_ops sm3_ops = {
|
|
+ .max_lanes = sm3_mb_sve_max_lanes,
|
|
+ .asimd_x4 = sm3_mb_asimd_x4,
|
|
+ .asimd_x1 = sm3_mb_asimd_x1,
|
|
+ .sve = sm3_mb_sve,
|
|
+ .iv_data = sm3_iv_data,
|
|
+ .iv_bytes = SM3_DIGEST_DATA_SIZE,
|
|
+ .max_jobs = SM3_MAX_LANES,
|
|
+};
|
|
+
|
|
+static void hash_mb_uninit_poll_queue(struct hash_mb_poll_queue *poll_queue)
|
|
+{
|
|
+ pthread_spin_destroy(&poll_queue->s_lock);
|
|
+}
|
|
+
|
|
+static void hash_mb_queue_uninit(struct wd_ctx_config_internal *config, int ctx_num)
|
|
+{
|
|
+ struct hash_mb_queue *mb_queue;
|
|
+ struct wd_soft_ctx *ctx;
|
|
+ int i;
|
|
+
|
|
+ for (i = 0; i < ctx_num; i++) {
|
|
+ ctx = (struct wd_soft_ctx *)config->ctxs[i].ctx;
|
|
+ mb_queue = ctx->priv;
|
|
+ pthread_spin_destroy(&mb_queue->r_lock);
|
|
+ hash_mb_uninit_poll_queue(&mb_queue->sm3_poll_queue);
|
|
+ hash_mb_uninit_poll_queue(&mb_queue->md5_poll_queue);
|
|
+ free(mb_queue);
|
|
+ }
|
|
+}
|
|
+
|
|
+static int hash_mb_init_poll_queue(struct hash_mb_poll_queue *poll_queue)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ ret = pthread_spin_init(&poll_queue->s_lock, PTHREAD_PROCESS_SHARED);
|
|
+ if (ret) {
|
|
+ WD_ERR("failed to init s_lock!\n");
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ poll_queue->head = NULL;
|
|
+ poll_queue->tail = NULL;
|
|
+ poll_queue->job_num = 0;
|
|
+
|
|
+ return WD_SUCCESS;
|
|
+}
|
|
+
|
|
+static int hash_mb_queue_init(struct wd_ctx_config_internal *config)
|
|
+{
|
|
+ struct hash_mb_queue *mb_queue;
|
|
+ int ctx_num = config->ctx_num;
|
|
+ struct wd_soft_ctx *ctx;
|
|
+ int i, ret;
|
|
+
|
|
+ for (i = 0; i < ctx_num; i++) {
|
|
+ mb_queue = calloc(1, sizeof(struct hash_mb_queue));
|
|
+ if (!mb_queue) {
|
|
+ ret = -WD_ENOMEM;
|
|
+ goto free_mb_queue;
|
|
+ }
|
|
+
|
|
+ mb_queue->ctx_mode = config->ctxs[i].ctx_mode;
|
|
+ ctx = (struct wd_soft_ctx *)config->ctxs[i].ctx;
|
|
+ ctx->priv = mb_queue;
|
|
+ ret = hash_mb_init_poll_queue(&mb_queue->sm3_poll_queue);
|
|
+ if (ret)
|
|
+ goto free_mem;
|
|
+
|
|
+ ret = hash_mb_init_poll_queue(&mb_queue->md5_poll_queue);
|
|
+ if (ret)
|
|
+ goto uninit_sm3_poll;
|
|
+
|
|
+ ret = pthread_spin_init(&mb_queue->r_lock, PTHREAD_PROCESS_SHARED);
|
|
+ if (ret) {
|
|
+ WD_ERR("failed to init r_lock!\n");
|
|
+ goto uninit_md5_poll;
|
|
+ }
|
|
+
|
|
+ mb_queue->sm3_poll_queue.ops = &sm3_ops;
|
|
+ mb_queue->md5_poll_queue.ops = &md5_ops;
|
|
+ mb_queue->recv_head = NULL;
|
|
+ mb_queue->recv_tail = NULL;
|
|
+ mb_queue->complete_cnt = 0;
|
|
+ }
|
|
+
|
|
+ return WD_SUCCESS;
|
|
+
|
|
+uninit_md5_poll:
|
|
+ hash_mb_uninit_poll_queue(&mb_queue->md5_poll_queue);
|
|
+uninit_sm3_poll:
|
|
+ hash_mb_uninit_poll_queue(&mb_queue->sm3_poll_queue);
|
|
+free_mem:
|
|
+ free(mb_queue);
|
|
+free_mb_queue:
|
|
+ hash_mb_queue_uninit(config, i);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int hash_mb_init(struct wd_alg_driver *drv, void *conf)
|
|
+{
|
|
+ struct wd_ctx_config_internal *config = conf;
|
|
+ struct hash_mb_ctx *priv;
|
|
+ int ret;
|
|
+
|
|
+ priv = malloc(sizeof(struct hash_mb_ctx));
|
|
+ if (!priv)
|
|
+ return -WD_ENOMEM;
|
|
+
|
|
+ /* multibuff does not use epoll. */
|
|
+ config->epoll_en = 0;
|
|
+ memcpy(&priv->config, config, sizeof(struct wd_ctx_config_internal));
|
|
+
|
|
+ ret = hash_mb_queue_init(config);
|
|
+ if (ret) {
|
|
+ free(priv);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ drv->priv = priv;
|
|
+
|
|
+ return WD_SUCCESS;
|
|
+}
|
|
+
|
|
+static void hash_mb_exit(struct wd_alg_driver *drv)
|
|
+{
|
|
+ struct hash_mb_ctx *priv = (struct hash_mb_ctx *)drv->priv;
|
|
+
|
|
+ if (!priv)
|
|
+ return;
|
|
+
|
|
+ hash_mb_queue_uninit(&priv->config, priv->config.ctx_num);
|
|
+ free(priv);
|
|
+ drv->priv = NULL;
|
|
+}
|
|
+
|
|
+static void hash_mb_pad_data(struct hash_pad *hash_pad, __u8 *in, __u32 partial,
|
|
+ __u64 total_len, bool transfer)
|
|
+{
|
|
+ __u64 size = total_len << BYTES_TO_BITS_OFFSET;
|
|
+ __u8 *buffer = hash_pad->pad;
|
|
+
|
|
+ if (partial)
|
|
+ memcpy(buffer, in, partial);
|
|
+
|
|
+ buffer[partial++] = 0x80;
|
|
+ if (partial <= HASH_PADLENGTHFIELD_SIZE) {
|
|
+ memset(buffer + partial, 0, HASH_PADLENGTHFIELD_SIZE - partial);
|
|
+ if (transfer) {
|
|
+ PUTU32(buffer + HASH_PADLENGTHFIELD_SIZE, size >> HASH_HIGH_32BITS);
|
|
+ PUTU32(buffer + HASH_PADLENGTHFIELD_SIZE + sizeof(__u32), size);
|
|
+ } else {
|
|
+ memcpy(buffer + HASH_PADLENGTHFIELD_SIZE, &size, sizeof(__u64));
|
|
+ }
|
|
+ hash_pad->pad_len = 1;
|
|
+ } else {
|
|
+ memset(buffer + partial, 0, HASH_PADDING_SIZE - partial);
|
|
+ if (transfer) {
|
|
+ PUTU32(buffer + HASH_PADDING_SIZE, size >> HASH_HIGH_32BITS);
|
|
+ PUTU32(buffer + HASH_PADDING_SIZE + sizeof(__u32), size);
|
|
+ } else {
|
|
+ memcpy(buffer + HASH_PADDING_SIZE, &size, sizeof(__u64));
|
|
+ }
|
|
+ hash_pad->pad_len = HASH_PADDING_BLOCKS;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void hash_xor(__u8 *key_out, __u8 *key_in, __u32 key_len, __u8 xor_value)
|
|
+{
|
|
+ __u32 i;
|
|
+
|
|
+ for (i = 0; i < HASH_KEY_LEN; i++) {
|
|
+ if (i < key_len)
|
|
+ key_out[i] = key_in[i] ^ xor_value;
|
|
+ else
|
|
+ key_out[i] = xor_value;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int hash_middle_block_process(struct hash_mb_poll_queue *poll_queue,
|
|
+ struct wd_digest_msg *d_msg,
|
|
+ struct hash_job *job)
|
|
+{
|
|
+ __u8 *buffer = d_msg->partial_block + d_msg->partial_bytes;
|
|
+ __u64 length = (__u64)d_msg->partial_bytes + d_msg->in_bytes;
|
|
+
|
|
+ if (length < HASH_BLOCK_SIZE) {
|
|
+ memcpy(buffer, d_msg->in, d_msg->in_bytes);
|
|
+ d_msg->partial_bytes = length;
|
|
+ return -WD_EAGAIN;
|
|
+ }
|
|
+
|
|
+ if (d_msg->partial_bytes) {
|
|
+ memcpy(buffer, d_msg->in, HASH_BLOCK_SIZE - d_msg->partial_bytes);
|
|
+ job->buffer = d_msg->partial_block;
|
|
+ poll_queue->ops->asimd_x1(job, 1);
|
|
+ length = d_msg->in_bytes - (HASH_BLOCK_SIZE - d_msg->partial_bytes);
|
|
+ buffer = d_msg->in + (HASH_BLOCK_SIZE - d_msg->partial_bytes);
|
|
+ } else {
|
|
+ buffer = d_msg->in;
|
|
+ }
|
|
+
|
|
+ job->len = length >> HASH_BLOCK_OFFSET;
|
|
+ d_msg->partial_bytes = length & (HASH_BLOCK_SIZE - 1);
|
|
+ if (d_msg->partial_bytes)
|
|
+ memcpy(d_msg->partial_block, buffer + (job->len << HASH_BLOCK_OFFSET),
|
|
+ d_msg->partial_bytes);
|
|
+
|
|
+ if (!job->len) {
|
|
+ memcpy(d_msg->out, job->result_digest, poll_queue->ops->iv_bytes);
|
|
+ return -WD_EAGAIN;
|
|
+ }
|
|
+
|
|
+ job->buffer = buffer;
|
|
+ job->pad.pad_len = 0;
|
|
+
|
|
+ return WD_SUCCESS;
|
|
+}
|
|
+
|
|
+static void hash_signle_block_process(struct wd_digest_msg *d_msg,
|
|
+ struct hash_job *job, __u64 total_len)
|
|
+{
|
|
+ __u32 hash_partial = d_msg->in_bytes & (HASH_BLOCK_SIZE - 1);
|
|
+ __u8 *buffer;
|
|
+
|
|
+ job->len = d_msg->in_bytes >> HASH_BLOCK_OFFSET;
|
|
+ buffer = d_msg->in + (job->len << HASH_BLOCK_OFFSET);
|
|
+ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer);
|
|
+ if (!job->len) {
|
|
+ job->buffer = job->pad.pad;
|
|
+ job->len = job->pad.pad_len;
|
|
+ job->pad.pad_len = 0;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ job->buffer = d_msg->in;
|
|
+}
|
|
+
|
|
+static void hash_final_block_process(struct hash_mb_poll_queue *poll_queue,
|
|
+ struct wd_digest_msg *d_msg,
|
|
+ struct hash_job *job)
|
|
+{
|
|
+ __u8 *buffer = d_msg->partial_block + d_msg->partial_bytes;
|
|
+ __u64 length = (__u64)d_msg->partial_bytes + d_msg->in_bytes;
|
|
+ __u32 hash_partial = length & (HASH_BLOCK_SIZE - 1);
|
|
+ __u64 total_len = d_msg->long_data_len;
|
|
+
|
|
+ if (job->opad.opad_size)
|
|
+ total_len += HASH_BLOCK_SIZE;
|
|
+
|
|
+ if (!d_msg->partial_bytes) {
|
|
+ hash_signle_block_process(d_msg, job, total_len);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (length <= HASH_BLOCK_SIZE) {
|
|
+ memcpy(buffer, d_msg->in, d_msg->in_bytes);
|
|
+ job->len = length >> HASH_BLOCK_OFFSET;
|
|
+ buffer = d_msg->partial_block + (job->len << HASH_BLOCK_OFFSET);
|
|
+ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer);
|
|
+ if (!job->len) {
|
|
+ job->buffer = job->pad.pad;
|
|
+ job->len = job->pad.pad_len;
|
|
+ job->pad.pad_len = 0;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ job->buffer = d_msg->partial_block;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ memcpy(buffer, d_msg->in, (HASH_BLOCK_SIZE - d_msg->partial_bytes));
|
|
+ job->buffer = d_msg->partial_block;
|
|
+ poll_queue->ops->asimd_x1(job, 1);
|
|
+ job->buffer = d_msg->in + (HASH_BLOCK_SIZE - d_msg->partial_bytes);
|
|
+ length = d_msg->in_bytes - (HASH_BLOCK_SIZE - d_msg->partial_bytes);
|
|
+ job->len = length >> HASH_BLOCK_OFFSET;
|
|
+ buffer = job->buffer + (job->len << HASH_BLOCK_OFFSET);
|
|
+ hash_partial = length & (HASH_BLOCK_SIZE - 1);
|
|
+ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer);
|
|
+ if (!job->len) {
|
|
+ job->buffer = job->pad.pad;
|
|
+ job->len = job->pad.pad_len;
|
|
+ job->pad.pad_len = 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int hash_first_block_process(struct wd_digest_msg *d_msg,
|
|
+ struct hash_job *job,
|
|
+ __u32 iv_bytes)
|
|
+{
|
|
+ __u8 *buffer;
|
|
+
|
|
+ job->len = d_msg->in_bytes >> HASH_BLOCK_OFFSET;
|
|
+ d_msg->partial_bytes = d_msg->in_bytes & (HASH_BLOCK_SIZE - 1);
|
|
+ if (d_msg->partial_bytes) {
|
|
+ buffer = d_msg->in + (job->len << HASH_BLOCK_OFFSET);
|
|
+ memcpy(d_msg->partial_block, buffer, d_msg->partial_bytes);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Long hash mode, if first block is less than HASH_BLOCK_SIZE,
|
|
+ * copy ikey hash result to out.
|
|
+ */
|
|
+ if (!job->len) {
|
|
+ memcpy(d_msg->out, job->result_digest, iv_bytes);
|
|
+ return -WD_EAGAIN;
|
|
+ }
|
|
+ job->buffer = d_msg->in;
|
|
+ job->pad.pad_len = 0;
|
|
+
|
|
+ return WD_SUCCESS;
|
|
+}
|
|
+
|
|
+static int hash_do_partial(struct hash_mb_poll_queue *poll_queue,
|
|
+ struct wd_digest_msg *d_msg, struct hash_job *job)
|
|
+{
|
|
+ enum hash_block_type bd_type = get_hash_block_type(d_msg);
|
|
+ __u64 total_len = d_msg->in_bytes;
|
|
+ int ret = WD_SUCCESS;
|
|
+
|
|
+ switch (bd_type) {
|
|
+ case HASH_FIRST_BLOCK:
|
|
+ ret = hash_first_block_process(d_msg, job, poll_queue->ops->iv_bytes);
|
|
+ break;
|
|
+ case HASH_MIDDLE_BLOCK:
|
|
+ ret = hash_middle_block_process(poll_queue, d_msg, job);
|
|
+ break;
|
|
+ case HASH_END_BLOCK:
|
|
+ hash_final_block_process(poll_queue, d_msg, job);
|
|
+ break;
|
|
+ case HASH_SINGLE_BLOCK:
|
|
+ if (job->opad.opad_size)
|
|
+ total_len += HASH_BLOCK_SIZE;
|
|
+ hash_signle_block_process(d_msg, job, total_len);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void hash_mb_init_iv(struct hash_mb_poll_queue *poll_queue,
|
|
+ struct wd_digest_msg *d_msg, struct hash_job *job)
|
|
+{
|
|
+ enum hash_block_type bd_type = get_hash_block_type(d_msg);
|
|
+ __u8 key_ipad[HASH_KEY_LEN];
|
|
+ __u8 key_opad[HASH_KEY_LEN];
|
|
+
|
|
+ job->opad.opad_size = 0;
|
|
+ switch (bd_type) {
|
|
+ case HASH_FIRST_BLOCK:
|
|
+ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes);
|
|
+ if (d_msg->mode != WD_DIGEST_HMAC)
|
|
+ return;
|
|
+
|
|
+ hash_xor(key_ipad, d_msg->key, d_msg->key_bytes, IPAD_VALUE);
|
|
+ job->buffer = key_ipad;
|
|
+ poll_queue->ops->asimd_x1(job, 1);
|
|
+ break;
|
|
+ case HASH_MIDDLE_BLOCK:
|
|
+ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes);
|
|
+ break;
|
|
+ case HASH_END_BLOCK:
|
|
+ if (d_msg->mode != WD_DIGEST_HMAC) {
|
|
+ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes);
|
|
+ return;
|
|
+ }
|
|
+ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes);
|
|
+ hash_xor(key_opad, d_msg->key, d_msg->key_bytes, OPAD_VALUE);
|
|
+ job->buffer = key_opad;
|
|
+ poll_queue->ops->asimd_x1(job, 1);
|
|
+ memcpy(job->opad.opad, job->result_digest, poll_queue->ops->iv_bytes);
|
|
+ job->opad.opad_size = poll_queue->ops->iv_bytes;
|
|
+ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes);
|
|
+ break;
|
|
+ case HASH_SINGLE_BLOCK:
|
|
+ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes);
|
|
+ if (d_msg->mode != WD_DIGEST_HMAC)
|
|
+ return;
|
|
+
|
|
+ hash_xor(key_ipad, d_msg->key, d_msg->key_bytes, IPAD_VALUE);
|
|
+ hash_xor(key_opad, d_msg->key, d_msg->key_bytes, OPAD_VALUE);
|
|
+ job->buffer = key_opad;
|
|
+ poll_queue->ops->asimd_x1(job, 1);
|
|
+ memcpy(job->opad.opad, job->result_digest, poll_queue->ops->iv_bytes);
|
|
+ job->opad.opad_size = poll_queue->ops->iv_bytes;
|
|
+ job->buffer = key_ipad;
|
|
+ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes);
|
|
+ poll_queue->ops->asimd_x1(job, 1);
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void hash_do_sync(struct hash_mb_poll_queue *poll_queue, struct hash_job *job)
|
|
+{
|
|
+ __u32 iv_bytes = poll_queue->ops->iv_bytes;
|
|
+ __u32 length;
|
|
+
|
|
+ poll_queue->ops->asimd_x1(job, job->len);
|
|
+
|
|
+ if (job->pad.pad_len) {
|
|
+ job->buffer = job->pad.pad;
|
|
+ poll_queue->ops->asimd_x1(job, job->pad.pad_len);
|
|
+ }
|
|
+
|
|
+ if (job->opad.opad_size) {
|
|
+ job->buffer = job->opad.opad + job->opad.opad_size;
|
|
+ memcpy(job->buffer, job->result_digest, iv_bytes);
|
|
+ memcpy(job->result_digest, job->opad.opad, iv_bytes);
|
|
+ length = HASH_BLOCK_SIZE + iv_bytes;
|
|
+ hash_mb_pad_data(&job->pad, job->buffer, iv_bytes, length, job->is_transfer);
|
|
+ job->buffer = job->pad.pad;
|
|
+ poll_queue->ops->asimd_x1(job, job->pad.pad_len);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void hash_mb_add_job_tail(struct hash_mb_poll_queue *poll_queue, struct hash_job *job)
|
|
+{
|
|
+ pthread_spin_lock(&poll_queue->s_lock);
|
|
+ if (poll_queue->job_num) {
|
|
+ poll_queue->tail->next = job;
|
|
+ poll_queue->tail = job;
|
|
+ } else {
|
|
+ poll_queue->head = job;
|
|
+ poll_queue->tail = job;
|
|
+ }
|
|
+ poll_queue->job_num++;
|
|
+ pthread_spin_unlock(&poll_queue->s_lock);
|
|
+}
|
|
+
|
|
+static void hash_mb_add_job_head(struct hash_mb_poll_queue *poll_queue, struct hash_job *job)
|
|
+{
|
|
+ pthread_spin_lock(&poll_queue->s_lock);
|
|
+ if (poll_queue->job_num) {
|
|
+ job->next = poll_queue->head;
|
|
+ poll_queue->head = job;
|
|
+ } else {
|
|
+ poll_queue->head = job;
|
|
+ poll_queue->tail = job;
|
|
+ }
|
|
+ poll_queue->job_num++;
|
|
+ pthread_spin_unlock(&poll_queue->s_lock);
|
|
+}
|
|
+
|
|
+static int hash_mb_check_param(struct hash_mb_queue *mb_queue, struct wd_digest_msg *d_msg)
|
|
+{
|
|
+ if (unlikely(mb_queue->ctx_mode == CTX_MODE_ASYNC && d_msg->has_next)) {
|
|
+ WD_ERR("invalid: async mode not supports long hash!\n");
|
|
+ return -WD_EINVAL;
|
|
+ }
|
|
+
|
|
+ if (unlikely(d_msg->data_fmt != WD_FLAT_BUF)) {
|
|
+ WD_ERR("invalid: hash multibuffer not supports sgl mode!\n");
|
|
+ return -WD_EINVAL;
|
|
+ }
|
|
+
|
|
+ return WD_SUCCESS;
|
|
+}
|
|
+
|
|
+static int hash_mb_send(struct wd_alg_driver *drv, handle_t ctx, void *drv_msg)
|
|
+{
|
|
+ struct wd_soft_ctx *s_ctx = (struct wd_soft_ctx *)ctx;
|
|
+ struct hash_mb_queue *mb_queue = s_ctx->priv;
|
|
+ struct wd_digest_msg *d_msg = drv_msg;
|
|
+ struct hash_mb_poll_queue *poll_queue;
|
|
+ struct hash_job hash_sync_job;
|
|
+ struct hash_job *hash_job;
|
|
+ int ret;
|
|
+
|
|
+ ret = hash_mb_check_param(mb_queue, d_msg);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (mb_queue->ctx_mode == CTX_MODE_ASYNC) {
|
|
+ hash_job = malloc(sizeof(struct hash_job));
|
|
+ if (unlikely(!hash_job))
|
|
+ return -WD_ENOMEM;
|
|
+ } else {
|
|
+ hash_job = &hash_sync_job;
|
|
+ }
|
|
+
|
|
+ switch (d_msg->alg) {
|
|
+ case WD_DIGEST_SM3:
|
|
+ poll_queue = &mb_queue->sm3_poll_queue;
|
|
+ hash_job->is_transfer = true;
|
|
+ break;
|
|
+ case WD_DIGEST_MD5:
|
|
+ poll_queue = &mb_queue->md5_poll_queue;
|
|
+ hash_job->is_transfer = false;
|
|
+ break;
|
|
+ default:
|
|
+ WD_ERR("invalid: alg type %u not support!\n", d_msg->alg);
|
|
+ if (mb_queue->ctx_mode == CTX_MODE_ASYNC)
|
|
+ free(hash_job);
|
|
+ return -WD_EINVAL;
|
|
+ }
|
|
+
|
|
+ hash_mb_init_iv(poll_queue, d_msg, hash_job);
|
|
+ /* If the block does not need to be processed, return directly. */
|
|
+ ret = hash_do_partial(poll_queue, d_msg, hash_job);
|
|
+ if (ret == -WD_EAGAIN) {
|
|
+ if (mb_queue->ctx_mode == CTX_MODE_ASYNC)
|
|
+ free(hash_job);
|
|
+
|
|
+ d_msg->result = WD_SUCCESS;
|
|
+ return WD_SUCCESS;
|
|
+ }
|
|
+
|
|
+ if (mb_queue->ctx_mode == CTX_MODE_SYNC) {
|
|
+ hash_do_sync(poll_queue, hash_job);
|
|
+ memcpy(d_msg->out, hash_job->result_digest, d_msg->out_bytes);
|
|
+ d_msg->result = WD_SUCCESS;
|
|
+ return WD_SUCCESS;
|
|
+ }
|
|
+
|
|
+ hash_job->msg = d_msg;
|
|
+ hash_mb_add_job_tail(poll_queue, hash_job);
|
|
+
|
|
+ return WD_SUCCESS;
|
|
+}
|
|
+
|
|
+static struct hash_job *hash_mb_find_complete_job(struct hash_mb_queue *mb_queue)
|
|
+{
|
|
+ struct hash_job *job;
|
|
+
|
|
+ pthread_spin_lock(&mb_queue->r_lock);
|
|
+ if (!mb_queue->complete_cnt) {
|
|
+ pthread_spin_unlock(&mb_queue->r_lock);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ job = mb_queue->recv_head;
|
|
+ mb_queue->recv_head = job->next;
|
|
+ mb_queue->complete_cnt--;
|
|
+ pthread_spin_unlock(&mb_queue->r_lock);
|
|
+
|
|
+ return job;
|
|
+}
|
|
+
|
|
+static int hash_recv_complete_job(struct hash_mb_queue *mb_queue, struct wd_digest_msg *msg)
|
|
+{
|
|
+ struct hash_mb_poll_queue *poll_queue;
|
|
+ struct hash_job *hash_job;
|
|
+ __u32 total_len;
|
|
+
|
|
+ hash_job = hash_mb_find_complete_job(mb_queue);
|
|
+ if (!hash_job)
|
|
+ return -WD_EAGAIN;
|
|
+
|
|
+ if (!hash_job->opad.opad_size) {
|
|
+ msg->tag = hash_job->msg->tag;
|
|
+ memcpy(hash_job->msg->out, hash_job->result_digest, hash_job->msg->out_bytes);
|
|
+ free(hash_job);
|
|
+ msg->result = WD_SUCCESS;
|
|
+ return WD_SUCCESS;
|
|
+ }
|
|
+
|
|
+ if (hash_job->msg->alg == WD_DIGEST_SM3)
|
|
+ poll_queue = &mb_queue->sm3_poll_queue;
|
|
+ else
|
|
+ poll_queue = &mb_queue->md5_poll_queue;
|
|
+ hash_job->buffer = hash_job->opad.opad + poll_queue->ops->iv_bytes;
|
|
+ memcpy(hash_job->buffer, hash_job->result_digest, poll_queue->ops->iv_bytes);
|
|
+ total_len = poll_queue->ops->iv_bytes + HASH_BLOCK_SIZE;
|
|
+ hash_mb_pad_data(&hash_job->pad, hash_job->buffer, poll_queue->ops->iv_bytes,
|
|
+ total_len, hash_job->is_transfer);
|
|
+ memcpy(hash_job->result_digest, hash_job->opad.opad, poll_queue->ops->iv_bytes);
|
|
+ hash_job->opad.opad_size = 0;
|
|
+ hash_job->buffer = hash_job->pad.pad;
|
|
+ hash_job->len = hash_job->pad.pad_len;
|
|
+ hash_job->pad.pad_len = 0;
|
|
+
|
|
+ hash_mb_add_job_head(poll_queue, hash_job);
|
|
+
|
|
+ return -WD_EAGAIN;
|
|
+}
|
|
+
|
|
+static struct hash_job *hash_mb_get_job(struct hash_mb_poll_queue *poll_queue)
|
|
+{
|
|
+ struct hash_job *job;
|
|
+
|
|
+ pthread_spin_lock(&poll_queue->s_lock);
|
|
+ if (!poll_queue->job_num) {
|
|
+ pthread_spin_unlock(&poll_queue->s_lock);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ job = poll_queue->head;
|
|
+ poll_queue->head = job->next;
|
|
+ poll_queue->job_num--;
|
|
+ pthread_spin_unlock(&poll_queue->s_lock);
|
|
+
|
|
+ return job;
|
|
+}
|
|
+
|
|
+static void hash_mb_add_finish_job(struct hash_mb_queue *mb_queue, struct hash_job *job)
|
|
+{
|
|
+ pthread_spin_lock(&mb_queue->r_lock);
|
|
+ if (mb_queue->complete_cnt) {
|
|
+ mb_queue->recv_tail->next = job;
|
|
+ mb_queue->recv_tail = job;
|
|
+ } else {
|
|
+ mb_queue->recv_head = job;
|
|
+ mb_queue->recv_tail = job;
|
|
+ }
|
|
+ mb_queue->complete_cnt++;
|
|
+ pthread_spin_unlock(&mb_queue->r_lock);
|
|
+}
|
|
+
|
|
+static struct hash_mb_poll_queue *hash_get_poll_queue(struct hash_mb_queue *mb_queue)
|
|
+{
|
|
+ if (!mb_queue->sm3_poll_queue.job_num &&
|
|
+ !mb_queue->md5_poll_queue.job_num)
|
|
+ return NULL;
|
|
+
|
|
+ if (mb_queue->md5_poll_queue.job_num >= mb_queue->sm3_poll_queue.job_num)
|
|
+ return &mb_queue->md5_poll_queue;
|
|
+
|
|
+ return &mb_queue->sm3_poll_queue;
|
|
+}
|
|
+
|
|
+static int hash_mb_do_jobs(struct hash_mb_queue *mb_queue)
|
|
+{
|
|
+ struct hash_mb_poll_queue *poll_queue = hash_get_poll_queue(mb_queue);
|
|
+ struct hash_job *job_vecs[HASH_MAX_LANES];
|
|
+ __u64 len = 0;
|
|
+ int maxjobs;
|
|
+ int j = 0;
|
|
+ int i = 0;
|
|
+
|
|
+ if (!poll_queue)
|
|
+ return -WD_EAGAIN;
|
|
+
|
|
+ maxjobs = poll_queue->ops->max_lanes();
|
|
+ maxjobs = MIN(maxjobs, poll_queue->ops->max_jobs);
|
|
+ while (j < maxjobs) {
|
|
+ job_vecs[j] = hash_mb_get_job(poll_queue);
|
|
+ if (!job_vecs[j])
|
|
+ break;
|
|
+
|
|
+ if (!j)
|
|
+ len = job_vecs[j]->len;
|
|
+ else
|
|
+ len = MIN(job_vecs[j]->len, len);
|
|
+ j++;
|
|
+ }
|
|
+
|
|
+ if (!j)
|
|
+ return -WD_EAGAIN;
|
|
+
|
|
+ if (j > HASH_NENO_PROCESS_JOBS) {
|
|
+ poll_queue->ops->sve(len, j, job_vecs);
|
|
+ } else if (j == HASH_NENO_PROCESS_JOBS) {
|
|
+ poll_queue->ops->asimd_x4(job_vecs[0], job_vecs[1],
|
|
+ job_vecs[2], job_vecs[3], len);
|
|
+ } else {
|
|
+ while (i < j)
|
|
+ poll_queue->ops->asimd_x1(job_vecs[i++], len);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < j; i++) {
|
|
+ if (job_vecs[i]->len == len) {
|
|
+ if (!job_vecs[i]->pad.pad_len) {
|
|
+ hash_mb_add_finish_job(mb_queue, job_vecs[i]);
|
|
+ } else {
|
|
+ job_vecs[i]->buffer = job_vecs[i]->pad.pad;
|
|
+ job_vecs[i]->len = job_vecs[i]->pad.pad_len;
|
|
+ job_vecs[i]->pad.pad_len = 0;
|
|
+ hash_mb_add_job_head(poll_queue, job_vecs[i]);
|
|
+ }
|
|
+ } else {
|
|
+ job_vecs[i]->len -= len;
|
|
+ job_vecs[i]->buffer += len << HASH_BLOCK_OFFSET;
|
|
+ hash_mb_add_job_head(poll_queue, job_vecs[i]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return WD_SUCCESS;
|
|
+}
|
|
+
|
|
+static int hash_mb_recv(struct wd_alg_driver *drv, handle_t ctx, void *drv_msg)
|
|
+{
|
|
+ struct wd_soft_ctx *s_ctx = (struct wd_soft_ctx *)ctx;
|
|
+ struct hash_mb_queue *mb_queue = s_ctx->priv;
|
|
+ struct wd_digest_msg *msg = drv_msg;
|
|
+ int ret, i = 0;
|
|
+
|
|
+ if (mb_queue->ctx_mode == CTX_MODE_SYNC)
|
|
+ return WD_SUCCESS;
|
|
+
|
|
+ while (i++ < HASH_TRY_PROCESS_COUNT) {
|
|
+ ret = hash_recv_complete_job(mb_queue, msg);
|
|
+ if (!ret)
|
|
+ return WD_SUCCESS;
|
|
+
|
|
+ ret = hash_mb_do_jobs(mb_queue);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return -WD_EAGAIN;
|
|
+}
|
|
+
|
|
+static int hash_mb_get_usage(void *param)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#define GEN_HASH_ALG_DRIVER(hash_alg_name) \
|
|
+{\
|
|
+ .drv_name = "hash_mb",\
|
|
+ .alg_name = (hash_alg_name),\
|
|
+ .calc_type = UADK_ALG_SVE_INSTR,\
|
|
+ .priority = 100,\
|
|
+ .queue_num = 1,\
|
|
+ .op_type_num = 1,\
|
|
+ .fallback = 0,\
|
|
+ .init = hash_mb_init,\
|
|
+ .exit = hash_mb_exit,\
|
|
+ .send = hash_mb_send,\
|
|
+ .recv = hash_mb_recv,\
|
|
+ .get_usage = hash_mb_get_usage,\
|
|
+}
|
|
+
|
|
+static struct wd_alg_driver hash_mb_driver[] = {
|
|
+ GEN_HASH_ALG_DRIVER("sm3"),
|
|
+ GEN_HASH_ALG_DRIVER("md5"),
|
|
+};
|
|
+
|
|
+static void __attribute__((constructor)) hash_mb_probe(void)
|
|
+{
|
|
+ size_t alg_num = ARRAY_SIZE(hash_mb_driver);
|
|
+ size_t i;
|
|
+ int ret;
|
|
+
|
|
+ WD_INFO("Info: register hash_mb alg drivers!\n");
|
|
+ for (i = 0; i < alg_num; i++) {
|
|
+ ret = wd_alg_driver_register(&hash_mb_driver[i]);
|
|
+ if (ret && ret != -WD_ENODEV)
|
|
+ WD_ERR("Error: register hash multibuff %s failed!\n",
|
|
+ hash_mb_driver[i].alg_name);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void __attribute__((destructor)) hash_mb_remove(void)
|
|
+{
|
|
+ size_t alg_num = ARRAY_SIZE(hash_mb_driver);
|
|
+ size_t i;
|
|
+
|
|
+ WD_INFO("Info: unregister hash_mb alg drivers!\n");
|
|
+ for (i = 0; i < alg_num; i++)
|
|
+ wd_alg_driver_unregister(&hash_mb_driver[i]);
|
|
+}
|
|
+
|
|
diff --git a/drv/hash_mb/hash_mb.h b/drv/hash_mb/hash_mb.h
new file mode 100644
index 0000000..aba5ec9
--- /dev/null
+++ b/drv/hash_mb/hash_mb.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+/* Copyright 2024 Huawei Technologies Co.,Ltd. All rights reserved. */
+
+#ifndef __HASH_MB_H
+#define __HASH_MB_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "drv/wd_digest_drv.h"
+#include "wd_digest.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HASH_BLOCK_SIZE 64
+#define HASH_DIGEST_NWORDS 32
+
+#if __STDC_VERSION__ >= 201112L
+# define __ALIGN_END __attribute__((aligned(64)))
+#else
+# define __ALIGN_END __aligned(64)
+#endif
+
+struct hash_pad {
+	__u8 pad[HASH_BLOCK_SIZE * 2];
+	__u32 pad_len;
+};
+
+struct hash_opad {
+	__u8 opad[HASH_BLOCK_SIZE];
+	__u32 opad_size;
+};
+
+struct hash_job {
+	void *buffer;
+	__u64 len;
+	__u8 result_digest[HASH_DIGEST_NWORDS] __ALIGN_END;
+	struct hash_pad pad;
+	struct hash_opad opad;
+	struct hash_job *next;
+	struct wd_digest_msg *msg;
+	bool is_transfer;
+};
+
+void sm3_mb_sve(int blocks, int total_lanes, struct hash_job **job_vec);
+void sm3_mb_asimd_x4(struct hash_job *job1, struct hash_job *job2,
+		     struct hash_job *job3, struct hash_job *job4, int len);
+void sm3_mb_asimd_x1(struct hash_job *job, int len);
+int sm3_mb_sve_max_lanes(void);
+void md5_mb_sve(int blocks, int total_lanes, struct hash_job **job_vec);
+void md5_mb_asimd_x4(struct hash_job *job1, struct hash_job *job2,
+		     struct hash_job *job3, struct hash_job *job4, int len);
+void md5_mb_asimd_x1(struct hash_job *job, int len);
+int md5_mb_sve_max_lanes(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __HASH_MB_H */
+
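For context on how these primitives fit together: hash_mb.c above batches the
queued jobs per algorithm and, in hash_mb_do_jobs(), hands a batch to the
widest kernel it can. The sketch below mirrors that dispatch rule for the SM3
entry points declared in this header; it is illustration only, not part of the
patch. sm3_mb_dispatch() is a hypothetical helper name, the build is assumed to
happen inside the uadk tree next to hash_mb.h, and the real driver additionally
caps the batch size at sm3_mb_sve_max_lanes().

/*
 * Illustrative sketch of the kernel-selection rule used by hash_mb_do_jobs():
 * "jobs" holds n dequeued hash_job pointers and "blocks" is the smallest
 * job->len in the batch.
 */
#include "hash_mb.h"

static void sm3_mb_dispatch(struct hash_job **jobs, int n, int blocks)
{
	int i;

	if (n > 4)			/* HASH_NENO_PROCESS_JOBS */
		sm3_mb_sve(blocks, n, jobs);
	else if (n == 4)
		sm3_mb_asimd_x4(jobs[0], jobs[1], jobs[2], jobs[3], blocks);
	else
		for (i = 0; i < n; i++)
			sm3_mb_asimd_x1(jobs[i], blocks);
}
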
diff --git a/drv/hash_mb/md5_mb_asimd_x1.S b/drv/hash_mb/md5_mb_asimd_x1.S
|
|
new file mode 100644
|
|
index 0000000..27d1124
|
|
--- /dev/null
|
|
+++ b/drv/hash_mb/md5_mb_asimd_x1.S
|
|
@@ -0,0 +1,248 @@
|
|
+/**********************************************************************
|
|
+ Copyright(c) 2020 Arm Corporation All rights reserved.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Arm Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+ .arch armv8-a
|
|
+
|
|
+/*
|
|
+Macros
|
|
+*/
|
|
+
|
|
+.macro declare_var_vector_reg name:req,reg:req
|
|
+ q_\name .req q\reg
|
|
+ v_\name .req v\reg
|
|
+ s_\name .req s\reg
|
|
+.endm
|
|
+
|
|
+
|
|
+.macro round_0_15 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
|
|
+ eor tmp0,\d_c,\d_d
|
|
+ mov k,\kl
|
|
+ and tmp0,tmp0,\d_b
|
|
+ movk k,\kh,lsl 16
|
|
+ eor tmp0,tmp0,\d_d
|
|
+ add tmp1,k,\w
|
|
+ add tmp0,tmp1,tmp0
|
|
+ add tmp0,\d_a,tmp0
|
|
+ ror tmp0,tmp0,32 - \r
|
|
+ add \d_a,\d_b,tmp0
|
|
+.endm
|
|
+
|
|
+.macro round_16_31 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
|
|
+ eor tmp0,\d_b,\d_c
|
|
+ mov k,\kl
|
|
+ and tmp0,tmp0,\d_d
|
|
+ movk k,\kh,lsl 16
|
|
+ eor tmp0,tmp0,\d_c
|
|
+ add tmp1,k,\w
|
|
+ add tmp0,tmp1,tmp0
|
|
+ add tmp0,\d_a,tmp0
|
|
+ ror tmp0,tmp0,32 - \r
|
|
+ add \d_a,\d_b,tmp0
|
|
+.endm
|
|
+
|
|
+.macro round_32_47 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
|
|
+ eor tmp0,\d_b,\d_c
|
|
+ mov k,\kl
|
|
+ eor tmp0,tmp0,\d_d
|
|
+ movk k,\kh,lsl 16
|
|
+ add tmp1,k,\w
|
|
+ add tmp0,tmp1,tmp0
|
|
+ add tmp0,\d_a,tmp0
|
|
+ ror tmp0,tmp0,32 - \r
|
|
+ add \d_a,\d_b,tmp0
|
|
+.endm
|
|
+
|
|
+.macro round_48_63 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
|
|
+ orn tmp0,\d_b,\d_d
|
|
+ mov k,\kl
|
|
+ eor tmp0,tmp0,\d_c
|
|
+ movk k,\kh,lsl 16
|
|
+ add tmp1,k,\w
|
|
+ add tmp0,tmp1,tmp0
|
|
+ add tmp0,\d_a,tmp0
|
|
+ ror tmp0,tmp0,32 - \r
|
|
+ add \d_a,\d_b,tmp0
|
|
+.endm
|
|
+/*
|
|
+ variables
|
|
+*/
|
|
+ job0 .req x0
|
|
+ digest_addr .req x0
|
|
+ len .req w1
|
|
+ end .req x1
|
|
+
|
|
+ buf_adr .req x2
|
|
+ d_a .req w3
|
|
+ d_b .req w4
|
|
+ d_c .req w5
|
|
+ d_d .req w6
|
|
+ k .req w7
|
|
+ m0 .req w8
|
|
+ m1 .req w9
|
|
+ m2 .req w10
|
|
+ m3 .req w11
|
|
+ m4 .req w12
|
|
+ m5 .req w13
|
|
+ m6 .req w14
|
|
+ m7 .req w15
|
|
+ m8 .req w19
|
|
+ m9 .req w20
|
|
+ m10 .req w21
|
|
+ m11 .req w22
|
|
+ m12 .req w23
|
|
+ m13 .req w24
|
|
+ m14 .req w25
|
|
+ m15 .req w26
|
|
+
|
|
+ tmp0 .req w27
|
|
+ tmp1 .req w28
|
|
+
|
|
+ d_a1 .req w8
|
|
+ d_b1 .req w9
|
|
+ d_c1 .req w15
|
|
+ d_d1 .req w19
|
|
+
|
|
+/*
|
|
+ void md5_mb_asimd_x1(MD5_JOB * job0,int len)
|
|
+*/
|
|
+ .global md5_mb_asimd_x1
|
|
+ .type md5_mb_asimd_x1, %function
|
|
+md5_mb_asimd_x1:
|
|
+ cmp len,0
|
|
+ stp x29, x30, [sp,-96]!
|
|
+ ldr buf_adr,[job0],64
|
|
+ stp x19, x20, [sp, 16]
|
|
+ add end,buf_adr,end,lsl 6
|
|
+ stp x21, x22, [sp, 32]
|
|
+ ldp d_a,d_b,[digest_addr]
|
|
+ stp x23, x24, [sp, 48]
|
|
+ ldp d_c,d_d,[digest_addr,8]
|
|
+ stp x25, x26, [sp, 64]
|
|
+ stp x27, x28, [sp, 80]
|
|
+ ble .exit
|
|
+
|
|
+.loop_start:
|
|
+ ldp m0,m1,[buf_adr],8
|
|
+ ldp m2,m3,[buf_adr],8
|
|
+ round_0_15 d_a,d_b,d_c,d_d,0xd76a,0xa478,m0,7
|
|
+
|
|
+ ldp m4,m5,[buf_adr],8
|
|
+ round_0_15 d_d,d_a,d_b,d_c,0xe8c7,0xb756,m1,12
|
|
+ ldp m6,m7,[buf_adr],8
|
|
+ round_0_15 d_c,d_d,d_a,d_b,0x2420,0x70db,m2,17
|
|
+ ldp m8,m9,[buf_adr],8
|
|
+ round_0_15 d_b,d_c,d_d,d_a,0xc1bd,0xceee,m3,22
|
|
+ ldp m10,m11,[buf_adr],8
|
|
+ round_0_15 d_a,d_b,d_c,d_d,0xf57c,0xfaf,m4,7
|
|
+ ldp m12,m13,[buf_adr],8
|
|
+ round_0_15 d_d,d_a,d_b,d_c,0x4787,0xc62a,m5,12
|
|
+ ldp m14,m15,[buf_adr],8
|
|
+ round_0_15 d_c,d_d,d_a,d_b,0xa830,0x4613,m6,17
|
|
+ round_0_15 d_b,d_c,d_d,d_a,0xfd46,0x9501,m7,22
|
|
+ round_0_15 d_a,d_b,d_c,d_d,0x6980,0x98d8,m8,7
|
|
+ round_0_15 d_d,d_a,d_b,d_c,0x8b44,0xf7af,m9,12
|
|
+ round_0_15 d_c,d_d,d_a,d_b,0xffff,0x5bb1,m10,17
|
|
+ round_0_15 d_b,d_c,d_d,d_a,0x895c,0xd7be,m11,22
|
|
+ round_0_15 d_a,d_b,d_c,d_d,0x6b90,0x1122,m12,7
|
|
+ round_0_15 d_d,d_a,d_b,d_c,0xfd98,0x7193,m13,12
|
|
+ round_0_15 d_c,d_d,d_a,d_b,0xa679,0x438e,m14,17
|
|
+ round_0_15 d_b,d_c,d_d,d_a,0x49b4,0x821,m15,22
|
|
+
|
|
+ round_16_31 d_a,d_b,d_c,d_d,0xf61e,0x2562,m1,5
|
|
+ round_16_31 d_d,d_a,d_b,d_c,0xc040,0xb340,m6,9
|
|
+ round_16_31 d_c,d_d,d_a,d_b,0x265e,0x5a51,m11,14
|
|
+ round_16_31 d_b,d_c,d_d,d_a,0xe9b6,0xc7aa,m0,20
|
|
+ round_16_31 d_a,d_b,d_c,d_d,0xd62f,0x105d,m5,5
|
|
+ round_16_31 d_d,d_a,d_b,d_c,0x244,0x1453,m10,9
|
|
+ round_16_31 d_c,d_d,d_a,d_b,0xd8a1,0xe681,m15,14
|
|
+ round_16_31 d_b,d_c,d_d,d_a,0xe7d3,0xfbc8,m4,20
|
|
+ round_16_31 d_a,d_b,d_c,d_d,0x21e1,0xcde6,m9,5
|
|
+ round_16_31 d_d,d_a,d_b,d_c,0xc337,0x7d6,m14,9
|
|
+ round_16_31 d_c,d_d,d_a,d_b,0xf4d5,0xd87,m3,14
|
|
+ round_16_31 d_b,d_c,d_d,d_a,0x455a,0x14ed,m8,20
|
|
+ round_16_31 d_a,d_b,d_c,d_d,0xa9e3,0xe905,m13,5
|
|
+ round_16_31 d_d,d_a,d_b,d_c,0xfcef,0xa3f8,m2,9
|
|
+ round_16_31 d_c,d_d,d_a,d_b,0x676f,0x2d9,m7,14
|
|
+ round_16_31 d_b,d_c,d_d,d_a,0x8d2a,0x4c8a,m12,20
|
|
+
|
|
+ round_32_47 d_a,d_b,d_c,d_d,0xfffa,0x3942,m5,4
|
|
+ round_32_47 d_d,d_a,d_b,d_c,0x8771,0xf681,m8,11
|
|
+ round_32_47 d_c,d_d,d_a,d_b,0x6d9d,0x6122,m11,16
|
|
+ round_32_47 d_b,d_c,d_d,d_a,0xfde5,0x380c,m14,23
|
|
+ round_32_47 d_a,d_b,d_c,d_d,0xa4be,0xea44,m1,4
|
|
+ round_32_47 d_d,d_a,d_b,d_c,0x4bde,0xcfa9,m4,11
|
|
+ round_32_47 d_c,d_d,d_a,d_b,0xf6bb,0x4b60,m7,16
|
|
+ round_32_47 d_b,d_c,d_d,d_a,0xbebf,0xbc70,m10,23
|
|
+ round_32_47 d_a,d_b,d_c,d_d,0x289b,0x7ec6,m13,4
|
|
+ round_32_47 d_d,d_a,d_b,d_c,0xeaa1,0x27fa,m0,11
|
|
+ round_32_47 d_c,d_d,d_a,d_b,0xd4ef,0x3085,m3,16
|
|
+ round_32_47 d_b,d_c,d_d,d_a,0x488,0x1d05,m6,23
|
|
+ round_32_47 d_a,d_b,d_c,d_d,0xd9d4,0xd039,m9,4
|
|
+ round_32_47 d_d,d_a,d_b,d_c,0xe6db,0x99e5,m12,11
|
|
+ round_32_47 d_c,d_d,d_a,d_b,0x1fa2,0x7cf8,m15,16
|
|
+ round_32_47 d_b,d_c,d_d,d_a,0xc4ac,0x5665,m2,23
|
|
+
|
|
+ round_48_63 d_a,d_b,d_c,d_d,0xf429,0x2244,m0,6
|
|
+ round_48_63 d_d,d_a,d_b,d_c,0x432a,0xff97,m7,10
|
|
+ round_48_63 d_c,d_d,d_a,d_b,0xab94,0x23a7,m14,15
|
|
+ round_48_63 d_b,d_c,d_d,d_a,0xfc93,0xa039,m5,21
|
|
+ round_48_63 d_a,d_b,d_c,d_d,0x655b,0x59c3,m12,6
|
|
+ round_48_63 d_d,d_a,d_b,d_c,0x8f0c,0xcc92,m3,10
|
|
+ round_48_63 d_c,d_d,d_a,d_b,0xffef,0xf47d,m10,15
|
|
+ round_48_63 d_b,d_c,d_d,d_a,0x8584,0x5dd1,m1,21
|
|
+ round_48_63 d_a,d_b,d_c,d_d,0x6fa8,0x7e4f,m8,6
|
|
+ round_48_63 d_d,d_a,d_b,d_c,0xfe2c,0xe6e0,m15,10
|
|
+ round_48_63 d_c,d_d,d_a,d_b,0xa301,0x4314,m6,15
|
|
+ round_48_63 d_b,d_c,d_d,d_a,0x4e08,0x11a1,m13,21
|
|
+ round_48_63 d_a,d_b,d_c,d_d,0xf753,0x7e82,m4,6
|
|
+ ldp d_a1,d_b1,[digest_addr]
|
|
+ round_48_63 d_d,d_a,d_b,d_c,0xbd3a,0xf235,m11,10
|
|
+ ldp d_c1,d_d1,[digest_addr,8]
|
|
+ round_48_63 d_c,d_d,d_a,d_b,0x2ad7,0xd2bb,m2,15
|
|
+ round_48_63 d_b,d_c,d_d,d_a,0xeb86,0xd391,m9,21
|
|
+
|
|
+ cmp buf_adr,end
|
|
+ add d_a,d_a1 ,d_a
|
|
+ str d_a,[digest_addr]
|
|
+ add d_b,d_b1 ,d_b
|
|
+ str d_b,[digest_addr,4]
|
|
+ add d_c,d_c1 ,d_c
|
|
+ str d_c,[digest_addr,8]
|
|
+ add d_d,d_d1 ,d_d
|
|
+ str d_d,[digest_addr,12]
|
|
+ bne .loop_start
|
|
+
|
|
+.exit:
|
|
+ ldp x19, x20, [sp, 16]
|
|
+ ldp x21, x22, [sp, 32]
|
|
+ ldp x23, x24, [sp, 48]
|
|
+ ldp x25, x26, [sp, 64]
|
|
+ ldp x27, x28, [sp, 80]
|
|
+ ldp x29, x30, [sp], 96
|
|
+ ret
|
|
+ .size md5_mb_asimd_x1, .-md5_mb_asimd_x1
|
|
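A note on the round macros in md5_mb_asimd_x1.S above: round_0_15 and
round_16_31 evaluate the MD5 F and G functions through the select identities
F(b,c,d) = (b & c) | (~b & d) = ((c ^ d) & b) ^ d and
G(b,c,d) = (b & d) | (c & ~d) = ((b ^ c) & d) ^ c, so each round needs only an
eor/and/eor sequence on a single temporary. The small check below is
illustration only, not part of the patch; it verifies both identities.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t b, c, d;

	/* One bit per variable is enough to prove a bitwise identity. */
	for (b = 0; b <= 1; b++)
		for (c = 0; c <= 1; c++)
			for (d = 0; d <= 1; d++) {
				uint32_t f_ref = ((b & c) | (~b & d)) & 1;
				uint32_t f_sel = (((c ^ d) & b) ^ d) & 1;
				uint32_t g_ref = ((b & d) | (c & ~d)) & 1;
				uint32_t g_sel = (((b ^ c) & d) ^ c) & 1;

				assert(f_ref == f_sel);
				assert(g_ref == g_sel);
			}
	printf("MD5 F/G bit-select identities hold\n");
	return 0;
}
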
diff --git a/drv/hash_mb/md5_mb_asimd_x4.S b/drv/hash_mb/md5_mb_asimd_x4.S
|
|
new file mode 100644
|
|
index 0000000..5397913
|
|
--- /dev/null
|
|
+++ b/drv/hash_mb/md5_mb_asimd_x4.S
|
|
@@ -0,0 +1,526 @@
|
|
+/**********************************************************************
|
|
+ Copyright(c) 2020 Arm Corporation All rights reserved.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Arm Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+ .arch armv8-a
|
|
+
|
|
+/*
|
|
+Macros
|
|
+*/
|
|
+
|
|
+.macro declare_var_vector_reg name:req,reg:req
|
|
+ q_\name .req q\reg
|
|
+ v_\name .req v\reg
|
|
+ s_\name .req s\reg
|
|
+.endm
|
|
+
|
|
+.macro add_key_rol a:req,b:req,k:req,w:req,r:req
|
|
+ add v_tmp0.4s,v_\k\().4s,v_\w\().4s
|
|
+ add v_tmp1.4s,v_tmp1.4s,v_\a\().4s
|
|
+ add v_tmp1.4s,v_tmp1.4s,v_tmp0.4s
|
|
+ shl v_tmp0.4s,v_tmp1.4s,\r
|
|
+ ushr v_tmp1.4s,v_tmp1.4s,32-\r
|
|
+ orr v_tmp0.16b,v_tmp1.16b,v_tmp0.16b
|
|
+
|
|
+ add v_\a\().4s,v_\b\().4s,v_tmp0.4s
|
|
+.endm
|
|
+.macro round_0_15 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
|
|
+ mov v_tmp1.16b, v_\b\().16b
|
|
+ bsl v_tmp1.16b, v_\c\().16b, v_\d\().16b
|
|
+ ldr q_\k1,[key_adr],16
|
|
+ add_key_rol \a,\b,\k,\w,\r
|
|
+.endm
|
|
+
|
|
+.macro round_16_31 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
|
|
+ mov v_tmp1.16b, v_\d\().16b
|
|
+ bsl v_tmp1.16b, v_\b\().16b, v_\c\().16b
|
|
+ ldr q_\k1,[key_adr],16
|
|
+ add_key_rol \a,\b,\k,\w,\r
|
|
+.endm
|
|
+
|
|
+.macro round_32_47 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
|
|
+ eor v_tmp1.16b,v_\b\().16b,v_\c\().16b
|
|
+ eor v_tmp1.16b,v_tmp1.16b,v_\d\().16b
|
|
+ ldr q_\k1,[key_adr],16
|
|
+ add_key_rol \a,\b,\k,\w,\r
|
|
+.endm
|
|
+
|
|
+.macro round_48_63 a:req,b:req,c:req,d:req,k:req,k1,w:req,r:req
|
|
+ orn v_tmp1.16b,v_\b\().16b,v_\d\().16b
|
|
+ eor v_tmp1.16b,v_tmp1.16b,v_\c\().16b
|
|
+ .ifnb \k1
|
|
+ ldr q_\k1,[key_adr],16
|
|
+ .endif
|
|
+ add_key_rol \a,\b,\k,\w,\r
|
|
+.endm
|
|
+/*
|
|
+ variables
|
|
+*/
|
|
+ declare_var_vector_reg tmp0, 0
|
|
+ declare_var_vector_reg tmp1, 1
|
|
+ declare_var_vector_reg k, 2
|
|
+ declare_var_vector_reg k1, 3
|
|
+ declare_var_vector_reg a, 4
|
|
+ declare_var_vector_reg b, 5
|
|
+ declare_var_vector_reg c, 6
|
|
+ declare_var_vector_reg d, 7
|
|
+ declare_var_vector_reg a1, 8
|
|
+ declare_var_vector_reg b1, 9
|
|
+ declare_var_vector_reg c1, 10
|
|
+ declare_var_vector_reg d1, 11
|
|
+
|
|
+ declare_var_vector_reg w0, 16
|
|
+ declare_var_vector_reg w1, 17
|
|
+ declare_var_vector_reg w2, 18
|
|
+ declare_var_vector_reg w3, 19
|
|
+ declare_var_vector_reg w4, 20
|
|
+ declare_var_vector_reg w5, 21
|
|
+ declare_var_vector_reg w6, 22
|
|
+ declare_var_vector_reg w7, 23
|
|
+ declare_var_vector_reg w8, 24
|
|
+ declare_var_vector_reg w9, 25
|
|
+ declare_var_vector_reg w10, 26
|
|
+ declare_var_vector_reg w11, 27
|
|
+ declare_var_vector_reg w12, 28
|
|
+ declare_var_vector_reg w13, 29
|
|
+ declare_var_vector_reg w14, 30
|
|
+ declare_var_vector_reg w15, 31
|
|
+
|
|
+ len .req w4
|
|
+ len_x .req x4
|
|
+ lane0 .req x5
|
|
+ lane1 .req x6
|
|
+ lane2 .req x7
|
|
+ lane3 .req x9
|
|
+ end .req x4
|
|
+ job0 .req x0
|
|
+ job1 .req x1
|
|
+ job2 .req x2
|
|
+ job3 .req x3
|
|
+ key_adr .req x10
|
|
+
|
|
+/*
|
|
+ void md5_mb_asimd_x4(MD5_JOB * job0, MD5_JOB * job1,
|
|
+ MD5_JOB * job2, MD5_JOB * job3, int len)
|
|
+*/
|
|
+ .global md5_mb_asimd_x4
|
|
+ .type md5_mb_asimd_x4, %function
|
|
+md5_mb_asimd_x4:
|
|
+ stp x29,x30,[sp,-48]!
|
|
+ ldr lane0,[job0],64
|
|
+ stp d8,d9,[sp,16]
|
|
+ ldr lane1,[job1],64
|
|
+ stp d10,d11,[sp,32]
|
|
+ ldr lane2,[job2],64
|
|
+ cmp len,0
|
|
+ ldr lane3,[job3],64
|
|
+ ble .exit
|
|
+
|
|
+ //load digests
|
|
+ ld4 {v_a.s-v_d.s}[0],[job0]
|
|
+ add end,lane0,len_x,lsl 6
|
|
+ ld4 {v_a.s-v_d.s}[1],[job1]
|
|
+ ld4 {v_a.s-v_d.s}[2],[job2]
|
|
+ ld4 {v_a.s-v_d.s}[3],[job3]
|
|
+.loop_start:
|
|
+ ld1 {v_w0.s}[0],[lane0],4
|
|
+ mov v_a1.16b,v_a.16b
|
|
+ ld1 {v_w0.s}[1],[lane1],4
|
|
+ mov v_b1.16b,v_b.16b
|
|
+ ld1 {v_w0.s}[2],[lane2],4
|
|
+ mov v_c1.16b,v_c.16b
|
|
+ ld1 {v_w0.s}[3],[lane3],4
|
|
+ mov v_d1.16b,v_d.16b
|
|
+
|
|
+ ld3 {v_w1.s-v_w3.s}[0],[lane0],12
|
|
+ adrp key_adr,.key_consts
|
|
+ ld3 {v_w1.s-v_w3.s}[1],[lane1],12
|
|
+ add key_adr,key_adr,#:lo12:.key_consts
|
|
+ ld3 {v_w1.s-v_w3.s}[2],[lane2],12
|
|
+ ldr q_k,[key_adr],16
|
|
+ ld3 {v_w1.s-v_w3.s}[3],[lane3],12
|
|
+
|
|
+
|
|
+ ld4 {v_w4.s-v_w7.s}[0], [lane0],16
|
|
+
|
|
+ round_0_15 a,b,c,d,k,k1,w0,7
|
|
+
|
|
+ ld4 {v_w4.s-v_w7.s}[1], [lane1],16
|
|
+ round_0_15 d,a,b,c,k1,k,w1,12
|
|
+ ld4 {v_w4.s-v_w7.s}[2], [lane2],16
|
|
+ round_0_15 c,d,a,b,k,k1,w2,17
|
|
+ ld4 {v_w4.s-v_w7.s}[3], [lane3],16
|
|
+ round_0_15 b,c,d,a,k1,k,w3,22
|
|
+ ld4 {v_w8.s-v_w11.s}[0],[lane0],16
|
|
+ round_0_15 a,b,c,d,k,k1,w4,7
|
|
+ ld4 {v_w8.s-v_w11.s}[1],[lane1],16
|
|
+ round_0_15 d,a,b,c,k1,k,w5,12
|
|
+ ld4 {v_w8.s-v_w11.s}[2],[lane2],16
|
|
+ round_0_15 c,d,a,b,k,k1,w6,17
|
|
+ ld4 {v_w8.s-v_w11.s}[3],[lane3],16
|
|
+ round_0_15 b,c,d,a,k1,k,w7,22
|
|
+ ld4 {v_w12.s-v_w15.s}[0],[lane0],16
|
|
+ round_0_15 a,b,c,d,k,k1,w8,7
|
|
+ ld4 {v_w12.s-v_w15.s}[1],[lane1],16
|
|
+ round_0_15 d,a,b,c,k1,k,w9,12
|
|
+ ld4 {v_w12.s-v_w15.s}[2],[lane2],16
|
|
+ round_0_15 c,d,a,b,k,k1,w10,17
|
|
+ ld4 {v_w12.s-v_w15.s}[3],[lane3],16
|
|
+ round_0_15 b,c,d,a,k1,k,w11,22
|
|
+ round_0_15 a,b,c,d,k,k1,w12,7
|
|
+ round_0_15 d,a,b,c,k1,k,w13,12
|
|
+ round_0_15 c,d,a,b,k,k1,w14,17
|
|
+ round_0_15 b,c,d,a,k1,k,w15,22
|
|
+
|
|
+ round_16_31 a,b,c,d,k,k1,w1,5
|
|
+ round_16_31 d,a,b,c,k1,k,w6,9
|
|
+ round_16_31 c,d,a,b,k,k1,w11,14
|
|
+ round_16_31 b,c,d,a,k1,k,w0,20
|
|
+ round_16_31 a,b,c,d,k,k1,w5,5
|
|
+ round_16_31 d,a,b,c,k1,k,w10,9
|
|
+ round_16_31 c,d,a,b,k,k1,w15,14
|
|
+ round_16_31 b,c,d,a,k1,k,w4,20
|
|
+ round_16_31 a,b,c,d,k,k1,w9,5
|
|
+ round_16_31 d,a,b,c,k1,k,w14,9
|
|
+ round_16_31 c,d,a,b,k,k1,w3,14
|
|
+ round_16_31 b,c,d,a,k1,k,w8,20
|
|
+ round_16_31 a,b,c,d,k,k1,w13,5
|
|
+ round_16_31 d,a,b,c,k1,k,w2,9
|
|
+ round_16_31 c,d,a,b,k,k1,w7,14
|
|
+ round_16_31 b,c,d,a,k1,k,w12,20
|
|
+
|
|
+ round_32_47 a,b,c,d,k,k1,w5,4
|
|
+ round_32_47 d,a,b,c,k1,k,w8,11
|
|
+ round_32_47 c,d,a,b,k,k1,w11,16
|
|
+ round_32_47 b,c,d,a,k1,k,w14,23
|
|
+ round_32_47 a,b,c,d,k,k1,w1,4
|
|
+ round_32_47 d,a,b,c,k1,k,w4,11
|
|
+ round_32_47 c,d,a,b,k,k1,w7,16
|
|
+ round_32_47 b,c,d,a,k1,k,w10,23
|
|
+ round_32_47 a,b,c,d,k,k1,w13,4
|
|
+ round_32_47 d,a,b,c,k1,k,w0,11
|
|
+ round_32_47 c,d,a,b,k,k1,w3,16
|
|
+ round_32_47 b,c,d,a,k1,k,w6,23
|
|
+ round_32_47 a,b,c,d,k,k1,w9,4
|
|
+ round_32_47 d,a,b,c,k1,k,w12,11
|
|
+ round_32_47 c,d,a,b,k,k1,w15,16
|
|
+ round_32_47 b,c,d,a,k1,k,w2,23
|
|
+
|
|
+ round_48_63 a,b,c,d,k,k1,w0,6
|
|
+ round_48_63 d,a,b,c,k1,k,w7,10
|
|
+ round_48_63 c,d,a,b,k,k1,w14,15
|
|
+ round_48_63 b,c,d,a,k1,k,w5,21
|
|
+ round_48_63 a,b,c,d,k,k1,w12,6
|
|
+ round_48_63 d,a,b,c,k1,k,w3,10
|
|
+ round_48_63 c,d,a,b,k,k1,w10,15
|
|
+ round_48_63 b,c,d,a,k1,k,w1,21
|
|
+ round_48_63 a,b,c,d,k,k1,w8,6
|
|
+ round_48_63 d,a,b,c,k1,k,w15,10
|
|
+ round_48_63 c,d,a,b,k,k1,w6,15
|
|
+ round_48_63 b,c,d,a,k1,k,w13,21
|
|
+ round_48_63 a,b,c,d,k,k1,w4,6
|
|
+ round_48_63 d,a,b,c,k1,k,w11,10
|
|
+ round_48_63 c,d,a,b,k,k1,w2,15
|
|
+ round_48_63 b,c,d,a,k1, ,w9,21
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+ cmp lane0,end
|
|
+ add v_a.4s,v_a1.4s,v_a.4s
|
|
+ add v_b.4s,v_b1.4s,v_b.4s
|
|
+ add v_c.4s,v_c1.4s,v_c.4s
|
|
+ add v_d.4s,v_d1.4s,v_d.4s
|
|
+ bne .loop_start
|
|
+
|
|
+ st4 {v_a.s-v_d.s}[0],[job0]
|
|
+ st4 {v_a.s-v_d.s}[1],[job1]
|
|
+ st4 {v_a.s-v_d.s}[2],[job2]
|
|
+ st4 {v_a.s-v_d.s}[3],[job3]
|
|
+.exit:
|
|
+ ldp d8,d9,[sp,16]
|
|
+ ldp d10,d11,[sp,32]
|
|
+ ldp x29,x30,[sp],48
|
|
+ ret
|
|
+.key_consts:
|
|
+ .word 0xd76aa478
|
|
+ .word 0xd76aa478
|
|
+ .word 0xd76aa478
|
|
+ .word 0xd76aa478
|
|
+ .word 0xe8c7b756
|
|
+ .word 0xe8c7b756
|
|
+ .word 0xe8c7b756
|
|
+ .word 0xe8c7b756
|
|
+ .word 0x242070db
|
|
+ .word 0x242070db
|
|
+ .word 0x242070db
|
|
+ .word 0x242070db
|
|
+ .word 0xc1bdceee
|
|
+ .word 0xc1bdceee
|
|
+ .word 0xc1bdceee
|
|
+ .word 0xc1bdceee
|
|
+ .word 0xf57c0faf
|
|
+ .word 0xf57c0faf
|
|
+ .word 0xf57c0faf
|
|
+ .word 0xf57c0faf
|
|
+ .word 0x4787c62a
|
|
+ .word 0x4787c62a
|
|
+ .word 0x4787c62a
|
|
+ .word 0x4787c62a
|
|
+ .word 0xa8304613
|
|
+ .word 0xa8304613
|
|
+ .word 0xa8304613
|
|
+ .word 0xa8304613
|
|
+ .word 0xfd469501
|
|
+ .word 0xfd469501
|
|
+ .word 0xfd469501
|
|
+ .word 0xfd469501
|
|
+ .word 0x698098d8
|
|
+ .word 0x698098d8
|
|
+ .word 0x698098d8
|
|
+ .word 0x698098d8
|
|
+ .word 0x8b44f7af
|
|
+ .word 0x8b44f7af
|
|
+ .word 0x8b44f7af
|
|
+ .word 0x8b44f7af
|
|
+ .word 0xffff5bb1
|
|
+ .word 0xffff5bb1
|
|
+ .word 0xffff5bb1
|
|
+ .word 0xffff5bb1
|
|
+ .word 0x895cd7be
|
|
+ .word 0x895cd7be
|
|
+ .word 0x895cd7be
|
|
+ .word 0x895cd7be
|
|
+ .word 0x6b901122
|
|
+ .word 0x6b901122
|
|
+ .word 0x6b901122
|
|
+ .word 0x6b901122
|
|
+ .word 0xfd987193
|
|
+ .word 0xfd987193
|
|
+ .word 0xfd987193
|
|
+ .word 0xfd987193
|
|
+ .word 0xa679438e
|
|
+ .word 0xa679438e
|
|
+ .word 0xa679438e
|
|
+ .word 0xa679438e
|
|
+ .word 0x49b40821
|
|
+ .word 0x49b40821
|
|
+ .word 0x49b40821
|
|
+ .word 0x49b40821
|
|
+ .word 0xf61e2562
|
|
+ .word 0xf61e2562
|
|
+ .word 0xf61e2562
|
|
+ .word 0xf61e2562
|
|
+ .word 0xc040b340
|
|
+ .word 0xc040b340
|
|
+ .word 0xc040b340
|
|
+ .word 0xc040b340
|
|
+ .word 0x265e5a51
|
|
+ .word 0x265e5a51
|
|
+ .word 0x265e5a51
|
|
+ .word 0x265e5a51
|
|
+ .word 0xe9b6c7aa
|
|
+ .word 0xe9b6c7aa
|
|
+ .word 0xe9b6c7aa
|
|
+ .word 0xe9b6c7aa
|
|
+ .word 0xd62f105d
|
|
+ .word 0xd62f105d
|
|
+ .word 0xd62f105d
|
|
+ .word 0xd62f105d
|
|
+ .word 0x02441453
|
|
+ .word 0x02441453
|
|
+ .word 0x02441453
|
|
+ .word 0x02441453
|
|
+ .word 0xd8a1e681
|
|
+ .word 0xd8a1e681
|
|
+ .word 0xd8a1e681
|
|
+ .word 0xd8a1e681
|
|
+ .word 0xe7d3fbc8
|
|
+ .word 0xe7d3fbc8
|
|
+ .word 0xe7d3fbc8
|
|
+ .word 0xe7d3fbc8
|
|
+ .word 0x21e1cde6
|
|
+ .word 0x21e1cde6
|
|
+ .word 0x21e1cde6
|
|
+ .word 0x21e1cde6
|
|
+ .word 0xc33707d6
|
|
+ .word 0xc33707d6
|
|
+ .word 0xc33707d6
|
|
+ .word 0xc33707d6
|
|
+ .word 0xf4d50d87
|
|
+ .word 0xf4d50d87
|
|
+ .word 0xf4d50d87
|
|
+ .word 0xf4d50d87
|
|
+ .word 0x455a14ed
|
|
+ .word 0x455a14ed
|
|
+ .word 0x455a14ed
|
|
+ .word 0x455a14ed
|
|
+ .word 0xa9e3e905
|
|
+ .word 0xa9e3e905
|
|
+ .word 0xa9e3e905
|
|
+ .word 0xa9e3e905
|
|
+ .word 0xfcefa3f8
|
|
+ .word 0xfcefa3f8
|
|
+ .word 0xfcefa3f8
|
|
+ .word 0xfcefa3f8
|
|
+ .word 0x676f02d9
|
|
+ .word 0x676f02d9
|
|
+ .word 0x676f02d9
|
|
+ .word 0x676f02d9
|
|
+ .word 0x8d2a4c8a
|
|
+ .word 0x8d2a4c8a
|
|
+ .word 0x8d2a4c8a
|
|
+ .word 0x8d2a4c8a
|
|
+ .word 0xfffa3942
|
|
+ .word 0xfffa3942
|
|
+ .word 0xfffa3942
|
|
+ .word 0xfffa3942
|
|
+ .word 0x8771f681
|
|
+ .word 0x8771f681
|
|
+ .word 0x8771f681
|
|
+ .word 0x8771f681
|
|
+ .word 0x6d9d6122
|
|
+ .word 0x6d9d6122
|
|
+ .word 0x6d9d6122
|
|
+ .word 0x6d9d6122
|
|
+ .word 0xfde5380c
|
|
+ .word 0xfde5380c
|
|
+ .word 0xfde5380c
|
|
+ .word 0xfde5380c
|
|
+ .word 0xa4beea44
|
|
+ .word 0xa4beea44
|
|
+ .word 0xa4beea44
|
|
+ .word 0xa4beea44
|
|
+ .word 0x4bdecfa9
|
|
+ .word 0x4bdecfa9
|
|
+ .word 0x4bdecfa9
|
|
+ .word 0x4bdecfa9
|
|
+ .word 0xf6bb4b60
|
|
+ .word 0xf6bb4b60
|
|
+ .word 0xf6bb4b60
|
|
+ .word 0xf6bb4b60
|
|
+ .word 0xbebfbc70
|
|
+ .word 0xbebfbc70
|
|
+ .word 0xbebfbc70
|
|
+ .word 0xbebfbc70
|
|
+ .word 0x289b7ec6
|
|
+ .word 0x289b7ec6
|
|
+ .word 0x289b7ec6
|
|
+ .word 0x289b7ec6
|
|
+ .word 0xeaa127fa
|
|
+ .word 0xeaa127fa
|
|
+ .word 0xeaa127fa
|
|
+ .word 0xeaa127fa
|
|
+ .word 0xd4ef3085
|
|
+ .word 0xd4ef3085
|
|
+ .word 0xd4ef3085
|
|
+ .word 0xd4ef3085
|
|
+ .word 0x04881d05
|
|
+ .word 0x04881d05
|
|
+ .word 0x04881d05
|
|
+ .word 0x04881d05
|
|
+ .word 0xd9d4d039
|
|
+ .word 0xd9d4d039
|
|
+ .word 0xd9d4d039
|
|
+ .word 0xd9d4d039
|
|
+ .word 0xe6db99e5
|
|
+ .word 0xe6db99e5
|
|
+ .word 0xe6db99e5
|
|
+ .word 0xe6db99e5
|
|
+ .word 0x1fa27cf8
|
|
+ .word 0x1fa27cf8
|
|
+ .word 0x1fa27cf8
|
|
+ .word 0x1fa27cf8
|
|
+ .word 0xc4ac5665
|
|
+ .word 0xc4ac5665
|
|
+ .word 0xc4ac5665
|
|
+ .word 0xc4ac5665
|
|
+ .word 0xf4292244
|
|
+ .word 0xf4292244
|
|
+ .word 0xf4292244
|
|
+ .word 0xf4292244
|
|
+ .word 0x432aff97
|
|
+ .word 0x432aff97
|
|
+ .word 0x432aff97
|
|
+ .word 0x432aff97
|
|
+ .word 0xab9423a7
|
|
+ .word 0xab9423a7
|
|
+ .word 0xab9423a7
|
|
+ .word 0xab9423a7
|
|
+ .word 0xfc93a039
|
|
+ .word 0xfc93a039
|
|
+ .word 0xfc93a039
|
|
+ .word 0xfc93a039
|
|
+ .word 0x655b59c3
|
|
+ .word 0x655b59c3
|
|
+ .word 0x655b59c3
|
|
+ .word 0x655b59c3
|
|
+ .word 0x8f0ccc92
|
|
+ .word 0x8f0ccc92
|
|
+ .word 0x8f0ccc92
|
|
+ .word 0x8f0ccc92
|
|
+ .word 0xffeff47d
|
|
+ .word 0xffeff47d
|
|
+ .word 0xffeff47d
|
|
+ .word 0xffeff47d
|
|
+ .word 0x85845dd1
|
|
+ .word 0x85845dd1
|
|
+ .word 0x85845dd1
|
|
+ .word 0x85845dd1
|
|
+ .word 0x6fa87e4f
|
|
+ .word 0x6fa87e4f
|
|
+ .word 0x6fa87e4f
|
|
+ .word 0x6fa87e4f
|
|
+ .word 0xfe2ce6e0
|
|
+ .word 0xfe2ce6e0
|
|
+ .word 0xfe2ce6e0
|
|
+ .word 0xfe2ce6e0
|
|
+ .word 0xa3014314
|
|
+ .word 0xa3014314
|
|
+ .word 0xa3014314
|
|
+ .word 0xa3014314
|
|
+ .word 0x4e0811a1
|
|
+ .word 0x4e0811a1
|
|
+ .word 0x4e0811a1
|
|
+ .word 0x4e0811a1
|
|
+ .word 0xf7537e82
|
|
+ .word 0xf7537e82
|
|
+ .word 0xf7537e82
|
|
+ .word 0xf7537e82
|
|
+ .word 0xbd3af235
|
|
+ .word 0xbd3af235
|
|
+ .word 0xbd3af235
|
|
+ .word 0xbd3af235
|
|
+ .word 0x2ad7d2bb
|
|
+ .word 0x2ad7d2bb
|
|
+ .word 0x2ad7d2bb
|
|
+ .word 0x2ad7d2bb
|
|
+ .word 0xeb86d391
|
|
+ .word 0xeb86d391
|
|
+ .word 0xeb86d391
|
|
+ .word 0xeb86d391
|
|
+ .size md5_mb_asimd_x4, .-md5_mb_asimd_x4
|
|
diff --git a/drv/hash_mb/md5_mb_sve.S b/drv/hash_mb/md5_mb_sve.S
|
|
new file mode 100644
|
|
index 0000000..8d8ecc1
|
|
--- /dev/null
|
|
+++ b/drv/hash_mb/md5_mb_sve.S
|
|
@@ -0,0 +1,158 @@
|
|
+/**********************************************************************
|
|
+ Copyright(c) 2022 Arm Corporation All rights reserved.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Arm Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+
|
|
+ .arch armv8.2-a+sve
|
|
+
|
|
+// copying data from sparse memory onto contiguous stack space
+// in order to gather-load into SVE registers
|
|
+.macro copy_mb_16words vecs:req,dest:req
|
|
+ mov src,\vecs
|
|
+ mov dst,\dest
|
|
+ mov counter,total_lanes
|
|
+10:
|
|
+ ldr tmp,[src],8
|
|
+ ldr tmp,[tmp]
|
|
+ add tmp,tmp,block_ctr,lsl 6
|
|
+ ld1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [tmp]
|
|
+ st1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [dst],64
|
|
+ subs counter,counter,1
|
|
+ b.ne 10b
|
|
+.endm
|
|
+
|
|
+.macro load_init
|
|
+ mov tmpw,16
|
|
+ index VOFFS.s,0,tmpw
|
|
+ copy_mb_16words job_vec,databuf
|
|
+.endm
|
|
+
|
|
+.macro load_word pipelines:req,windex:req,zreg0:req,zreg1
|
|
+ add tmp,databuf,\windex * 4
|
|
+ ld1w { \zreg0\().s}, p0/z, [tmp, VOFFS.s, UXTW 2]
|
|
+ .if \pipelines > 1
|
|
+ add tmp,tmp,veclen,lsl #6
|
|
+ ld1w {\zreg1\().s}, p1/z, [tmp, VOFFS.s, UXTW 2]
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+#include "md5_sve_common.S"
|
|
+
|
|
+/* int md5_mb_sve_max_lanes()
|
|
+ */
|
|
+ .global md5_mb_sve_max_lanes
|
|
+ .type md5_mb_sve_max_lanes, %function
|
|
+md5_mb_sve_max_lanes:
|
|
+ cntw x0
|
|
+ add x0,x0,x0
|
|
+ ret
|
|
+ .size md5_mb_sve_max_lanes, .-md5_mb_sve_max_lanes
|
|
+
|
|
+/*
|
|
+ * void md5_mb_sve(int blocks, int total_lanes, MD5_JOB **job_vec)
|
|
+ */
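+/*
+ * Layout assumed by this kernel (see copy_mb_16words, .ldr_hash and
+ * .str_hash): the first 8 bytes of each job are read as a pointer to the
+ * lane's data buffer (block N is fetched from buffer + N * 64), and the
+ * 16-byte working digest is loaded from and stored back at job offset 64.
+ */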
|
|
+ num_blocks .req w0
|
|
+ total_lanes .req w1
|
|
+ job_vec .req x2
|
|
+ src .req x5
|
|
+ dst .req x6
|
|
+ tmp .req x8
|
|
+ tmpw .req w8
|
|
+ block_ctr .req x9
|
|
+ block_ctr_w .req w9
|
|
+ savedsp .req x10
|
|
+ databuf .req x11
|
|
+ counter .req w12
|
|
+ veclen .req x13
|
|
+ veclen_w .req w13
|
|
+ abcd_buf .req x14
|
|
+ md5key_adr .req x15
|
|
+
|
|
+ .global md5_mb_sve
|
|
+ .type md5_mb_sve, %function
|
|
+md5_mb_sve:
|
|
+ cbz num_blocks,.return
|
|
+ md5_sve_save_stack
|
|
+ mov savedsp,sp
|
|
+ // reserve (16 * lanes) for abcd buf
|
|
+ mov tmpw,total_lanes,lsl 4
|
|
+ sub abcd_buf,sp,tmp
|
|
+ // reserve (64 * lanes) for data buf
|
|
+ mov tmpw,total_lanes,lsl 6
|
|
+ sub databuf,abcd_buf,tmp
|
|
+ mov sp,databuf
|
|
+ adr md5key_adr,MD5_CONST_KEYS
|
|
+ whilelo p0.s,wzr,total_lanes
|
|
+ mov src,job_vec
|
|
+ mov dst,abcd_buf
|
|
+ mov counter,total_lanes
|
|
+.ldr_hash:
|
|
+ ldr tmp,[src],8
|
|
+ add tmp,tmp,64
|
|
+ ld1 {v0.16b},[tmp]
|
|
+ st1 {v0.16b},[dst],16
|
|
+ subs counter,counter,1
|
|
+ bne .ldr_hash
|
|
+ ld4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0/z,[abcd_buf]
|
|
+ mov block_ctr,0
|
|
+ cntp veclen,p0,p0.s
|
|
+ cmp veclen_w,total_lanes
|
|
+ b.eq .loop_1x
|
|
+ whilelo p1.s,veclen_w,total_lanes
|
|
+ add tmp,abcd_buf,veclen,lsl #4
|
|
+ ld4w {VA_1.s,VB_1.s,VC_1.s,VD_1.s},p1/z,[tmp]
|
|
+ b .loop_2x
|
|
+.loop_1x:
|
|
+ md5_single 1
|
|
+ add block_ctr, block_ctr, 1
|
|
+ cmp block_ctr_w,num_blocks
|
|
+ bne .loop_1x
|
|
+ st4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0,[abcd_buf]
|
|
+ b 1f
|
|
+.loop_2x:
|
|
+ md5_single 2
|
|
+ add block_ctr, block_ctr, 1
|
|
+ cmp block_ctr_w,num_blocks
|
|
+ bne .loop_2x
|
|
+ st4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0,[abcd_buf]
|
|
+ add tmp,abcd_buf,veclen,lsl #4
|
|
+ st4w {VA_1.s,VB_1.s,VC_1.s,VD_1.s},p1,[tmp]
|
|
+1:
|
|
+ mov dst,job_vec
|
|
+ mov src,abcd_buf
|
|
+.str_hash:
|
|
+ ld1 {v0.16b},[src],16
|
|
+ ldr tmp,[dst],8
|
|
+ add tmp,tmp,64
|
|
+ st1 {v0.16b},[tmp]
|
|
+ subs total_lanes,total_lanes,1
|
|
+ bne .str_hash
|
|
+ mov sp,savedsp
|
|
+ md5_sve_restore_stack
|
|
+.return:
|
|
+ ret
|
|
+ .size md5_mb_sve, .-md5_mb_sve
|
|
diff --git a/drv/hash_mb/md5_sve_common.S b/drv/hash_mb/md5_sve_common.S
new file mode 100644
index 0000000..ed81482
--- /dev/null
+++ b/drv/hash_mb/md5_sve_common.S
@@ -0,0 +1,478 @@
|
|
+/**********************************************************************
|
|
+ Copyright(c) 2022 Arm Corporation All rights reserved.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Arm Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+ VK .req z0
|
|
+ VOFFS .req z1
|
|
+ VA_0 .req z2
|
|
+ VB_0 .req z3
|
|
+ VC_0 .req z4
|
|
+ VD_0 .req z5
|
|
+ VF_0 .req z6
|
|
+ VF_1 .req z7
|
|
+ VA_1 .req z16
|
|
+ VB_1 .req z17
|
|
+ VC_1 .req z18
|
|
+ VD_1 .req z19
|
|
+ MD5WORD0_0 .req z20
|
|
+ MD5WORD1_0 .req z21
|
|
+ MD5WORD0_1 .req z22
|
|
+ MD5WORD1_1 .req z23
|
|
+ TMPV0 .req v20
|
|
+ TMPV1 .req v21
|
|
+ TMPV2 .req v22
|
|
+ TMPV3 .req v23
|
|
+ VTMP_0 .req z24
|
|
+ VAA_0 .req z25
|
|
+ VBB_0 .req z26
|
|
+ VCC_0 .req z27
|
|
+ VDD_0 .req z28
|
|
+ VTMP_1 .req z29
|
|
+ VAA_1 .req z30
|
|
+ VBB_1 .req z31
|
|
+ VCC_1 .req z8
|
|
+ VDD_1 .req z9
|
|
+ TT .req z0
|
|
+
|
|
+.macro rotate_left_x1 out:req,in:req,tmp:req,bits
|
|
+ .if \bits == 16
|
|
+ revh \out\().s,p0/m,\in\().s
|
|
+ .else
|
|
+ .if have_sve2 == 0
|
|
+ lsl \tmp\().s, \in\().s,\bits
|
|
+ lsr \out\().s,\in\().s,32-\bits
|
|
+ orr \out\().d,\out\().d,\tmp\().d
|
|
+ .else
|
|
+ movprfx \out\().d,\in\().d
|
|
+ xar \out\().s,\out\().s,VZERO.s,32-\bits
|
|
+ .endif
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro rotate_left_x2 out:req,in:req,tmp:req,bits,out1:req,in1:req,tmp1:req,bits1
|
|
+
|
|
+ .if \bits == 16
|
|
+ revh \out\().s,p0/m,\in\().s
|
|
+ revh \out1\().s,p0/m,\in1\().s
|
|
+ .else
|
|
+ .if have_sve2 == 0
|
|
+ lsl \tmp\().s, \in\().s,\bits
|
|
+ lsl \tmp1\().s, \in1\().s,\bits1
|
|
+ lsr \out\().s,\in\().s,32-\bits
|
|
+ lsr \out1\().s,\in1\().s,32-\bits1
|
|
+ orr \out\().d,\out\().d,\tmp\().d
|
|
+ orr \out1\().d,\out1\().d,\tmp1\().d
|
|
+ .else
|
|
+ movprfx \out\().d,\in\().d
|
|
+ xar \out\().s,\out\().s,VZERO.s,32-\bits
|
|
+ movprfx \out1\().d,\in1\().d
|
|
+ xar \out1\().s,\out1\().s,VZERO.s,32-\bits1
|
|
+ .endif
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro bsl_x1 ret:req,x:req,y:req,z:req,tmp:req
|
|
+ .if have_sve2 == 0
|
|
+ bic \ret\().d,\z\().d,\x\().d
|
|
+ and \tmp\().d,\x\().d,\y\().d
|
|
+ orr \ret\().d,\ret\().d,\tmp\().d
|
|
+ .else
|
|
+ movprfx \ret\().d,\x\().d
|
|
+ bsl \ret\().d,\ret\().d,\y\().d,\z\().d
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro bsl_x2 ret:req,x:req,y:req,z:req,tmp:req,ret1:req,x1:req,y1:req,z1:req,tmp1:req
|
|
+ .if have_sve2 == 0
|
|
+ bic \ret\().d,\z\().d,\x\().d
|
|
+ bic \ret1\().d,\z1\().d,\x1\().d
|
|
+ and \tmp\().d,\x\().d,\y\().d
|
|
+ and \tmp1\().d,\x1\().d,\y1\().d
|
|
+ orr \ret\().d,\ret\().d,\tmp\().d
|
|
+ orr \ret1\().d,\ret1\().d,\tmp1\().d
|
|
+ .else
|
|
+ movprfx \ret\().d,\x\().d
|
|
+ bsl \ret\().d,\ret\().d,\y\().d,\z\().d
|
|
+ movprfx \ret1\().d,\x1\().d
|
|
+ bsl \ret1\().d,\ret1\().d,\y1\().d,\z1\().d
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+
|
|
+// F = D ^ (B and (C xor D))
|
|
+// that is (B and C) or ((not B) and D)
|
|
+.macro FUNC_F0_x1
|
|
+ bsl_x1 VF_0,VB_0,VC_0,VD_0,VTMP_0
|
|
+.endm
|
|
+
|
|
+.macro FUNC_F0_x2
|
|
+ bsl_x2 VF_0,VB_0,VC_0,VD_0,VTMP_0,VF_1,VB_1,VC_1,VD_1,VTMP_1
|
|
+.endm
|
|
+
|
|
+// F = C xor (D and (B xor C))
|
|
+// that is (D and B) or ((not D) and C)
|
|
+.macro FUNC_F1_x1
|
|
+ bsl_x1 VF_0,VD_0,VB_0,VC_0,VTMP_0
|
|
+.endm
|
|
+
|
|
+.macro FUNC_F1_x2
|
|
+ bsl_x2 VF_0,VD_0,VB_0,VC_0,VTMP_0,VF_1,VD_1,VB_1,VC_1,VTMP_1
|
|
+.endm
|
|
+
|
|
+// F := B xor C xor D
|
|
+.macro FUNC_F2_x1
|
|
+ .if have_sve2 == 0
|
|
+ eor VF_0.d,VB_0.d,VC_0.d
|
|
+ eor VF_0.d,VF_0.d,VD_0.d
|
|
+ .else
|
|
+ movprfx VF_0.d,VB_0.d
|
|
+ eor3 VF_0.d,VF_0.d,VC_0.d,VD_0.d
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro FUNC_F2_x2
|
|
+ .if have_sve2 == 0
|
|
+ eor VF_0.d,VB_0.d,VC_0.d
|
|
+ eor VF_1.d,VB_1.d,VC_1.d
|
|
+ eor VF_0.d,VF_0.d,VD_0.d
|
|
+ eor VF_1.d,VF_1.d,VD_1.d
|
|
+ .else
|
|
+ movprfx VF_0.d,VB_0.d
|
|
+ eor3 VF_0.d,VF_0.d,VC_0.d,VD_0.d
|
|
+ movprfx VF_1.d,VB_1.d
|
|
+ eor3 VF_1.d,VF_1.d,VC_1.d,VD_1.d
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+// F := C xor (B or (not D))
|
|
+.macro FUNC_F3_x1
|
|
+ not VF_0.s,p0/m,VD_0.s
|
|
+ orr VF_0.d,VF_0.d,VB_0.d
|
|
+ eor VF_0.d,VF_0.d,VC_0.d
|
|
+.endm
|
|
+
|
|
+.macro FUNC_F3_x2
|
|
+ not VF_0.s,p0/m,VD_0.s
|
|
+ not VF_1.s,p0/m,VD_1.s
|
|
+ orr VF_0.d,VF_0.d,VB_0.d
|
|
+ orr VF_1.d,VF_1.d,VB_1.d
|
|
+ eor VF_0.d,VF_0.d,VC_0.d
|
|
+ eor VF_1.d,VF_1.d,VC_1.d
|
|
+.endm
|
|
+
|
|
+.macro SWAP_STATES
|
|
+ .unreq TT
|
|
+ TT .req VA_0
|
|
+ .unreq VA_0
|
|
+ VA_0 .req VD_0
|
|
+ .unreq VD_0
|
|
+ VD_0 .req VC_0
|
|
+ .unreq VC_0
|
|
+ VC_0 .req VB_0
|
|
+ .unreq VB_0
|
|
+ VB_0 .req TT
|
|
+
|
|
+ .unreq TT
|
|
+ TT .req VA_1
|
|
+ .unreq VA_1
|
|
+ VA_1 .req VD_1
|
|
+ .unreq VD_1
|
|
+ VD_1 .req VC_1
|
|
+ .unreq VC_1
|
|
+ VC_1 .req VB_1
|
|
+ .unreq VB_1
|
|
+ VB_1 .req TT
|
|
+.endm
|
|
+
|
|
+.macro MD5_STEP_x1 windex:req,mg:req,func_f:req,bits:req
|
|
+ ld1rw {VK.s},p0/z,[md5key_adr,windex * 4]
|
|
+ \func_f\()_x1
|
|
+ add VTMP_0.s,VA_0.s,\mg\()_0.s
|
|
+ add VF_0.s,VF_0.s,VK.s
|
|
+ add VF_0.s,VF_0.s,VTMP_0.s
|
|
+ rotate_left_x1 VA_0,VF_0,VTMP_0,\bits
|
|
+ add VA_0.s,VA_0.s,VB_0.s
|
|
+.endm
|
|
+
|
|
+.macro MD5_STEP_x2 windex:req,mg:req,func_f:req,bits:req
|
|
+ ld1rw {VK.s},p0/z,[md5key_adr,windex * 4]
|
|
+ \func_f\()_x2
|
|
+ add VTMP_0.s,VA_0.s,\mg\()_0.s
|
|
+ add VTMP_1.s,VA_1.s,\mg\()_1.s
|
|
+ add VF_0.s,VF_0.s,VK.s
|
|
+ add VF_1.s,VF_1.s,VK.s
|
|
+ add VF_0.s,VF_0.s,VTMP_0.s
|
|
+ add VF_1.s,VF_1.s,VTMP_1.s
|
|
+ rotate_left_x2 VA_0,VF_0,VTMP_0,\bits,VA_1,VF_1,VTMP_1,\bits
|
|
+ add VA_0.s,VA_0.s,VB_0.s
|
|
+ add VA_1.s,VA_1.s,VB_1.s
|
|
+.endm
|
|
+
|
|
+.altmacro
|
|
+.macro load_words index:req,mg:req
|
|
+ load_word %num_pipelines,\index,MD5WORD\mg\()_0,MD5WORD\mg\()_1
|
|
+.endm
|
|
+
|
|
+.macro MD5_STEP_WRAPPER pipelines:req,windex:req,gindex:req,mg:req,\
|
|
+ func_f:req,bits:req,gindex_next,mg_next
|
|
+ .ifnb \gindex_next
|
|
+ load_words \gindex_next,\mg_next
|
|
+ .endif
|
|
+ MD5_STEP_x\pipelines\() \windex,MD5WORD\mg\(),\func_f,\bits
|
|
+.endm
|
|
+
|
|
+.macro exec_step windex:req,gindex:req,bits:req,gindex_next
|
|
+ .if \windex % 2 == 0
|
|
+ mg=0
|
|
+ mg_next=1
|
|
+ .else
|
|
+ mg=1
|
|
+ mg_next=0
|
|
+ .endif
|
|
+
|
|
+ .if \windex <= 15
|
|
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
|
|
+ FUNC_F0,\bits,\gindex_next,%mg_next
|
|
+ .endif
|
|
+ .if \windex >= 16 && \windex <= 31
|
|
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
|
|
+ FUNC_F1,\bits,\gindex_next,%mg_next
|
|
+ .endif
|
|
+ .if \windex >= 32 && \windex <= 47
|
|
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
|
|
+ FUNC_F2,\bits,\gindex_next,%mg_next
|
|
+ .endif
|
|
+ .if \windex >= 48 && \windex < 63
|
|
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
|
|
+ FUNC_F3,\bits,\gindex_next,%mg_next
|
|
+ .endif
|
|
+ .if \windex == 63
|
|
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,FUNC_F3,\bits
|
|
+ .endif
|
|
+ SWAP_STATES
|
|
+.endm
|
|
+
|
|
+.macro exec_steps
|
|
+ exec_step 0,0,7,1
|
|
+ exec_step 1,1,12,2
|
|
+ exec_step 2,2,17,3
|
|
+ exec_step 3,3,22,4
|
|
+ exec_step 4,4,7,5
|
|
+ exec_step 5,5,12,6
|
|
+ exec_step 6,6,17,7
|
|
+ exec_step 7,7,22,8
|
|
+ exec_step 8,8,7,9
|
|
+ exec_step 9,9,12,10
|
|
+ exec_step 10,10,17,11
|
|
+ exec_step 11,11,22,12
|
|
+ exec_step 12,12,7,13
|
|
+ exec_step 13,13,12,14
|
|
+ exec_step 14,14,17,15
|
|
+ exec_step 15,15,22,1
|
|
+ exec_step 16,1,5,6
|
|
+ exec_step 17,6,9,11
|
|
+ exec_step 18,11,14,0
|
|
+ exec_step 19,0,20,5
|
|
+ exec_step 20,5,5,10
|
|
+ exec_step 21,10,9,15
|
|
+ exec_step 22,15,14,4
|
|
+ exec_step 23,4,20,9
|
|
+ exec_step 24,9,5,14
|
|
+ exec_step 25,14,9,3
|
|
+ exec_step 26,3,14,8
|
|
+ exec_step 27,8,20,13
|
|
+ exec_step 28,13,5,2
|
|
+ exec_step 29,2,9,7
|
|
+ exec_step 30,7,14,12
|
|
+ exec_step 31,12,20,5
|
|
+ exec_step 32,5,4,8
|
|
+ exec_step 33,8,11,11
|
|
+ exec_step 34,11,16,14
|
|
+ exec_step 35,14,23,1
|
|
+ exec_step 36,1,4,4
|
|
+ exec_step 37,4,11,7
|
|
+ exec_step 38,7,16,10
|
|
+ exec_step 39,10,23,13
|
|
+ exec_step 40,13,4,0
|
|
+ exec_step 41,0,11,3
|
|
+ exec_step 42,3,16,6
|
|
+ exec_step 43,6,23,9
|
|
+ exec_step 44,9,4,12
|
|
+ exec_step 45,12,11,15
|
|
+ exec_step 46,15,16,2
|
|
+ exec_step 47,2,23,0
|
|
+ exec_step 48,0,6,7
|
|
+ exec_step 49,7,10,14
|
|
+ exec_step 50,14,15,5
|
|
+ exec_step 51,5,21,12
|
|
+ exec_step 52,12,6,3
|
|
+ exec_step 53,3,10,10
|
|
+ exec_step 54,10,15,1
|
|
+ exec_step 55,1,21,8
|
|
+ exec_step 56,8,6,15
|
|
+ exec_step 57,15,10,6
|
|
+ exec_step 58,6,15,13
|
|
+ exec_step 59,13,21,4
|
|
+ exec_step 60,4,6,11
|
|
+ exec_step 61,11,10,2
|
|
+ exec_step 62,2,15,9
|
|
+ exec_step 63,9,21
|
|
+.endm
|
|
+
|
|
+.macro prepare_x1
|
|
+ load_words 0,0
|
|
+ orr VAA_0.d,VA_0.d,VA_0.d
|
|
+ orr VBB_0.d,VB_0.d,VB_0.d
|
|
+ orr VCC_0.d,VC_0.d,VC_0.d
|
|
+ orr VDD_0.d,VD_0.d,VD_0.d
|
|
+.endm
|
|
+
|
|
+.macro prepare_x2
|
|
+ load_words 0,0
|
|
+ orr VAA_0.d,VA_0.d,VA_0.d
|
|
+ orr VAA_1.d,VA_1.d,VA_1.d
|
|
+ orr VBB_0.d,VB_0.d,VB_0.d
|
|
+ orr VBB_1.d,VB_1.d,VB_1.d
|
|
+ orr VCC_0.d,VC_0.d,VC_0.d
|
|
+ orr VCC_1.d,VC_1.d,VC_1.d
|
|
+ orr VDD_0.d,VD_0.d,VD_0.d
|
|
+ orr VDD_1.d,VD_1.d,VD_1.d
|
|
+.endm
|
|
+
|
|
+.macro finish_x1
|
|
+ add VA_0.s,VA_0.s,VAA_0.s
|
|
+ add VB_0.s,VB_0.s,VBB_0.s
|
|
+ add VC_0.s,VC_0.s,VCC_0.s
|
|
+ add VD_0.s,VD_0.s,VDD_0.s
|
|
+.endm
|
|
+
|
|
+.macro finish_x2
|
|
+ add VA_0.s,VA_0.s,VAA_0.s
|
|
+ add VA_1.s,VA_1.s,VAA_1.s
|
|
+ add VB_0.s,VB_0.s,VBB_0.s
|
|
+ add VB_1.s,VB_1.s,VBB_1.s
|
|
+ add VC_0.s,VC_0.s,VCC_0.s
|
|
+ add VC_1.s,VC_1.s,VCC_1.s
|
|
+ add VD_0.s,VD_0.s,VDD_0.s
|
|
+ add VD_1.s,VD_1.s,VDD_1.s
|
|
+.endm
|
|
+
|
|
+.macro md5_single pipelines:req,sve2
|
|
+ .ifnb \sve2
|
|
+ have_sve2=1
|
|
+ eor VZERO.d,VZERO.d,VZERO.d
|
|
+ .else
|
|
+ have_sve2=0
|
|
+ .endif
|
|
+ num_pipelines=\pipelines
|
|
+ load_init
|
|
+
|
|
+ prepare_x\pipelines\()
|
|
+ exec_steps
|
|
+ finish_x\pipelines\()
|
|
+.endm
|
|
+
|
|
+.macro md5_sve_save_stack
|
|
+ stp d8,d9,[sp, -48]!
|
|
+ stp d10,d11,[sp, 16]
|
|
+ stp d12,d13,[sp, 32]
|
|
+.endm
|
|
+
|
|
+.macro md5_sve_restore_stack
|
|
+ ldp d10,d11,[sp, 16]
|
|
+ ldp d12,d13,[sp, 32]
|
|
+ ldp d8,d9,[sp],48
|
|
+.endm
|
|
+
|
|
+ .section .rodata.cst16,"aM",@progbits,16
|
|
+ .align 16
|
|
+
|
|
+MD5_CONST_KEYS:
|
|
+ .word 0xd76aa478
|
|
+ .word 0xe8c7b756
|
|
+ .word 0x242070db
|
|
+ .word 0xc1bdceee
|
|
+ .word 0xf57c0faf
|
|
+ .word 0x4787c62a
|
|
+ .word 0xa8304613
|
|
+ .word 0xfd469501
|
|
+ .word 0x698098d8
|
|
+ .word 0x8b44f7af
|
|
+ .word 0xffff5bb1
|
|
+ .word 0x895cd7be
|
|
+ .word 0x6b901122
|
|
+ .word 0xfd987193
|
|
+ .word 0xa679438e
|
|
+ .word 0x49b40821
|
|
+ .word 0xf61e2562
|
|
+ .word 0xc040b340
|
|
+ .word 0x265e5a51
|
|
+ .word 0xe9b6c7aa
|
|
+ .word 0xd62f105d
|
|
+ .word 0x02441453
|
|
+ .word 0xd8a1e681
|
|
+ .word 0xe7d3fbc8
|
|
+ .word 0x21e1cde6
|
|
+ .word 0xc33707d6
|
|
+ .word 0xf4d50d87
|
|
+ .word 0x455a14ed
|
|
+ .word 0xa9e3e905
|
|
+ .word 0xfcefa3f8
|
|
+ .word 0x676f02d9
|
|
+ .word 0x8d2a4c8a
|
|
+ .word 0xfffa3942
|
|
+ .word 0x8771f681
|
|
+ .word 0x6d9d6122
|
|
+ .word 0xfde5380c
|
|
+ .word 0xa4beea44
|
|
+ .word 0x4bdecfa9
|
|
+ .word 0xf6bb4b60
|
|
+ .word 0xbebfbc70
|
|
+ .word 0x289b7ec6
|
|
+ .word 0xeaa127fa
|
|
+ .word 0xd4ef3085
|
|
+ .word 0x04881d05
|
|
+ .word 0xd9d4d039
|
|
+ .word 0xe6db99e5
|
|
+ .word 0x1fa27cf8
|
|
+ .word 0xc4ac5665
|
|
+ .word 0xf4292244
|
|
+ .word 0x432aff97
|
|
+ .word 0xab9423a7
|
|
+ .word 0xfc93a039
|
|
+ .word 0x655b59c3
|
|
+ .word 0x8f0ccc92
|
|
+ .word 0xffeff47d
|
|
+ .word 0x85845dd1
|
|
+ .word 0x6fa87e4f
|
|
+ .word 0xfe2ce6e0
|
|
+ .word 0xa3014314
|
|
+ .word 0x4e0811a1
|
|
+ .word 0xf7537e82
|
|
+ .word 0xbd3af235
|
|
+ .word 0x2ad7d2bb
|
|
+ .word 0xeb86d391
|
|
diff --git a/drv/hash_mb/sm3_mb_asimd_x1.S b/drv/hash_mb/sm3_mb_asimd_x1.S
new file mode 100644
index 0000000..c7362de
--- /dev/null
+++ b/drv/hash_mb/sm3_mb_asimd_x1.S
@@ -0,0 +1,387 @@
|
|
+/**********************************************************************
|
|
+ Copyright(c) 2020 Arm Corporation All rights reserved.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Arm Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+ .arch armv8.2-a
|
|
+ .text
|
|
+ .align 2
|
|
+ .p2align 3,,7
|
|
+
|
|
+.macro declare_var_vector_reg name:req,reg:req
|
|
+ q\name\() .req q\reg
|
|
+ v\name\() .req v\reg
|
|
+ s\name\() .req s\reg
|
|
+.endm
|
|
+
|
|
+ job .req x0
|
|
+ len .req x1
|
|
+ data .req x2
|
|
+ digest .req x0
|
|
+
|
|
+ msg0 .req w3
|
|
+ msg1 .req w4
|
|
+ msg2 .req w5
|
|
+ msg3 .req w6
|
|
+ msg4 .req w7
|
|
+
|
|
+ msg .req w9
|
|
+ msgP .req w10
|
|
+ SS1 .req w11
|
|
+ SS2 .req w12
|
|
+ TT1 .req w13
|
|
+ TT2 .req w14
|
|
+ Tj .req w15
|
|
+ tmp0 .req w19
|
|
+ tmp1 .req w20
|
|
+ dig_A .req w21
|
|
+ dig_B .req w22
|
|
+ dig_C .req w23
|
|
+ dig_D .req w24
|
|
+ dig_E .req w25
|
|
+ dig_F .req w26
|
|
+ dig_G .req w27
|
|
+ dig_H .req w28
|
|
+
|
|
+ declare_var_vector_reg dig0,0
|
|
+ declare_var_vector_reg dig1,1
|
|
+ declare_var_vector_reg dig0_bak,2
|
|
+ declare_var_vector_reg dig1_bak,3
|
|
+ declare_var_vector_reg vect_msg0,4
|
|
+ declare_var_vector_reg vect_msg1,5
|
|
+ declare_var_vector_reg vect_msg2,6
|
|
+ declare_var_vector_reg vect_msg3,7
|
|
+
|
|
+ declare_var_vector_reg vect_msgP0,16
|
|
+ declare_var_vector_reg vect_msgP1,17
|
|
+ declare_var_vector_reg vect_msgP2,18
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+// round 0-11
|
|
+.macro sm3_round_0 round:req
|
|
+ ldr msg, [sp,msg_off+4*\round\()]
|
|
+ ldr msgP,[sp,wp_off +4*\round\()]
|
|
+ add SS1,dig_E,Tj
|
|
+ ror TT1,dig_A,32-12
|
|
+ add SS1,SS1,TT1
|
|
+ ror SS1,SS1,32-7 //SS1 done
|
|
+ eor SS2,SS1,TT1 //SS2 done
|
|
+ eor TT1,dig_A,dig_B
|
|
+ eor TT2,dig_E,dig_F
|
|
+ add SS2,SS2,msgP
|
|
+ eor TT2,TT2,dig_G
|
|
+ add SS1,SS1,msg
|
|
+ eor TT1,TT1,dig_C
|
|
+ add SS2,SS2,dig_D
|
|
+ add SS1,SS1,dig_H
|
|
+ add TT1,TT1,SS2
|
|
+ add TT2,TT2,SS1
|
|
+ mov dig_D,dig_C
|
|
+ ror dig_C,dig_B,32-9
|
|
+ mov dig_B,dig_A
|
|
+ mov dig_A,TT1
|
|
+ eor TT1,TT2,TT2,ror (32-17)
|
|
+ mov dig_H,dig_G
|
|
+ ror dig_G,dig_F,32-19
|
|
+ mov dig_F,dig_E
|
|
+ eor dig_E,TT1,TT2,ror(32-9)
|
|
+ ror Tj,Tj,(32-1)
|
|
+.endm
|
|
+
|
|
+//round 12-15
|
|
+.macro sm3_round_12 round:req
|
|
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
|
|
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
|
|
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
|
|
+ add SS1,dig_E,Tj
|
|
+ ror TT1,dig_A,32-12
|
|
+ add SS1,SS1,TT1
|
|
+ ror SS1,SS1,32-7 //SS1 done
|
|
+ eor SS2,SS1,TT1 //SS2 done
|
|
+
|
|
+ eor msg0,msg0,msg1
|
|
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
|
|
+ eor TT1,dig_A,dig_B
|
|
+ eor TT2,dig_E,dig_F
|
|
+ add SS2,SS2,dig_D
|
|
+ eor TT2,TT2,dig_G
|
|
+ add SS1,SS1,msg
|
|
+ eor msg0,msg0,msg2,ror (32-15)
|
|
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
|
|
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
|
|
+ eor msg1,msg0,msg0,ror (32 -15)
|
|
+ eor TT1,TT1,dig_C
|
|
+ add TT1,TT1,SS2
|
|
+ eor msg4,msg4,msg3, ror (32-7)
|
|
+ eor msg0,msg1,msg0, ror (32-23)
|
|
+ add SS1,SS1,dig_H
|
|
+ eor msg0,msg0,msg4
|
|
+ add TT2,TT2,SS1
|
|
+ mov dig_D,dig_C
|
|
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
|
|
+ eor msgP,msg,msg0
|
|
+ add TT1,TT1,msgP
|
|
+ ror dig_C,dig_B,32-9
|
|
+ mov dig_B,dig_A
|
|
+ mov dig_A,TT1
|
|
+ eor TT1,TT2,TT2,ror (32-17)
|
|
+ mov dig_H,dig_G
|
|
+ ror dig_G,dig_F,32-19
|
|
+ mov dig_F,dig_E
|
|
+ eor dig_E,TT1,TT2,ror(32-9)
|
|
+ ror Tj,Tj,32-1
|
|
+.endm
|
|
+
|
|
+// round 16-62
|
|
+.macro sm3_round_16 round:req
|
|
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
|
|
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
|
|
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
|
|
+ add SS1,dig_E,Tj
|
|
+ ror TT1,dig_A,32-12
|
|
+ add SS1,SS1,TT1
|
|
+ ror SS1,SS1,32-7 //SS1 done
|
|
+ eor SS2,SS1,TT1 //SS2 done
|
|
+
|
|
+ eor msg0,msg0,msg1
|
|
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
|
|
+ orr TT1,dig_B,dig_C
|
|
+ and tmp0,dig_B,dig_C
|
|
+
|
|
+ eor TT2,dig_F,dig_G
|
|
+ and TT1,TT1,dig_A
|
|
+ add SS2,SS2,dig_D
|
|
+ orr TT1,TT1,tmp0
|
|
+ and TT2,TT2,dig_E
|
|
+ add SS1,SS1,msg
|
|
+ eor TT2,TT2,dig_G
|
|
+
|
|
+ eor msg0,msg0,msg2,ror (32-15)
|
|
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
|
|
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
|
|
+ eor msg1,msg0,msg0,ror (32 -15)
|
|
+ add TT1,TT1,SS2
|
|
+ eor msg4,msg4,msg3, ror (32-7)
|
|
+ eor msg0,msg1,msg0, ror (32-23)
|
|
+ add SS1,SS1,dig_H
|
|
+ eor msg0,msg0,msg4
|
|
+ add TT2,TT2,SS1
|
|
+ mov dig_D,dig_C
|
|
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
|
|
+ eor msgP,msg,msg0
|
|
+ add TT1,TT1,msgP
|
|
+ ror dig_C,dig_B,32-9
|
|
+ mov dig_B,dig_A
|
|
+ mov dig_A,TT1
|
|
+ eor TT1,TT2,TT2,ror (32-17)
|
|
+ mov dig_H,dig_G
|
|
+ ror dig_G,dig_F,32-19
|
|
+ mov dig_F,dig_E
|
|
+ eor dig_E,TT1,TT2,ror(32-9)
|
|
+ ror Tj,Tj,32-1
|
|
+.endm
|
|
+
|
|
+//round 63
|
|
+.macro sm3_round_63 round:req
|
|
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
|
|
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
|
|
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
|
|
+ add SS1,dig_E,Tj
|
|
+ ror TT1,dig_A,32-12
|
|
+ add SS1,SS1,TT1
|
|
+ ror SS1,SS1,32-7 //SS1 done
|
|
+ eor SS2,SS1,TT1 //SS2 done
|
|
+ eor msg0,msg0,msg1
|
|
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
|
|
+ orr TT1,dig_B,dig_C
|
|
+ and tmp0,dig_B,dig_C
|
|
+ eor TT2,dig_F,dig_G
|
|
+ and TT1,TT1,dig_A
|
|
+ add SS2,SS2,dig_D
|
|
+ orr TT1,TT1,tmp0
|
|
+ and TT2,TT2,dig_E
|
|
+ add SS1,SS1,msg
|
|
+ eor TT2,TT2,dig_G
|
|
+ eor msg0,msg0,msg2,ror (32-15)
|
|
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
|
|
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
|
|
+ eor msg1,msg0,msg0,ror (32 -15)
|
|
+ add TT1,TT1,SS2
|
|
+ eor msg4,msg4,msg3, ror (32-7)
|
|
+ eor msg0,msg1,msg0, ror (32-23)
|
|
+ add SS1,SS1,dig_H
|
|
+ eor msg0,msg0,msg4
|
|
+ add TT2,TT2,SS1
|
|
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
|
|
+ eor msgP,msg,msg0
|
|
+ add TT1,TT1,msgP
|
|
+ ins vdig0_bak.s[3],dig_C
|
|
+ ror dig_C,dig_B,32-9
|
|
+ ins vdig0_bak.s[1],dig_A
|
|
+ ins vdig0_bak.s[0],TT1
|
|
+ ins vdig0_bak.s[2],dig_C
|
|
+ eor TT1,TT2,TT2,ror (32-17)
|
|
+ ins vdig1_bak.s[3],dig_G
|
|
+ ror dig_G,dig_F,32-19
|
|
+ ins vdig1_bak.s[1],dig_E
|
|
+ ins vdig1_bak.s[2],dig_G
|
|
+ eor dig_E,TT1,TT2,ror(32-9)
|
|
+ ins vdig1_bak.s[0],dig_E
|
|
+.endm
|
|
+
|
|
+ .set wp_off , 96
|
|
+ .set msg_off, 96 + 12*4
|
|
+#define STACK_SIZE 224
|
|
+ .global sm3_mb_asimd_x1
|
|
+ .type sm3_mb_asimd_x1, %function
|
|
+sm3_mb_asimd_x1:
|
|
+ stp x29,x30, [sp,-STACK_SIZE]!
|
|
+ cmp len,0
|
|
+ ldr data,[job],64
|
|
+ ldp qdig0,qdig1,[digest]
|
|
+ stp x19, x20, [sp, 16]
|
|
+ stp x21, x22, [sp, 32]
|
|
+ rev32 vdig0.16b,vdig0.16b
|
|
+ stp x23, x24, [sp, 48]
|
|
+ rev32 vdig1.16b,vdig1.16b
|
|
+ stp x25, x26, [sp, 64]
|
|
+ stp x27, x28, [sp, 80]
|
|
+ ble .exit_func
|
|
+
|
|
+.start_loop:
|
|
+
|
|
+ /** prepare first 12 round data **/
|
|
+ ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64
|
|
+ mov Tj, 17689
|
|
+ umov dig_A,vdig0.s[0]
|
|
+ movk Tj, 0x79cc, lsl 16
|
|
+ rev32 vvect_msg0.16b,vvect_msg0.16b
|
|
+ umov dig_B,vdig0.s[1]
|
|
+ rev32 vvect_msg1.16b,vvect_msg1.16b
|
|
+ umov dig_C,vdig0.s[2]
|
|
+ rev32 vvect_msg2.16b,vvect_msg2.16b
|
|
+ umov dig_D,vdig0.s[3]
|
|
+ rev32 vvect_msg3.16b,vvect_msg3.16b
|
|
+ umov dig_E,vdig1.s[0]
|
|
+ stp qvect_msg0,qvect_msg1,[sp,msg_off]
|
|
+ umov dig_F,vdig1.s[1]
|
|
+ stp qvect_msg2,qvect_msg3,[sp,msg_off+32]
|
|
+ umov dig_G,vdig1.s[2]
|
|
+ eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b
|
|
+ eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b
|
|
+ umov dig_H,vdig1.s[3]
|
|
+ stp qvect_msgP0,qvect_msgP1,[sp,wp_off]
|
|
+ eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b
|
|
+ str qvect_msgP2,[sp,wp_off+32]
|
|
+
|
|
+ sm3_round_0 0
|
|
+ sm3_round_0 1
|
|
+ sm3_round_0 2
|
|
+ sm3_round_0 3
|
|
+ sm3_round_0 4
|
|
+ sm3_round_0 5
|
|
+ sm3_round_0 6
|
|
+ sm3_round_0 7
|
|
+ sm3_round_0 8
|
|
+ sm3_round_0 9
|
|
+ sm3_round_0 10
|
|
+ sm3_round_0 11
|
|
+
|
|
+ sm3_round_12 12
|
|
+ sm3_round_12 13
|
|
+ sm3_round_12 14
|
|
+ sm3_round_12 15
|
|
+ mov Tj, 0x7a87
|
|
+ movk Tj, 0x9d8a, lsl 16
|
|
+ sm3_round_16 16
|
|
+ sm3_round_16 17
|
|
+ sm3_round_16 18
|
|
+ sm3_round_16 19
|
|
+ sm3_round_16 20
|
|
+ sm3_round_16 21
|
|
+ sm3_round_16 22
|
|
+ sm3_round_16 23
|
|
+ sm3_round_16 24
|
|
+ sm3_round_16 25
|
|
+ sm3_round_16 26
|
|
+ sm3_round_16 27
|
|
+ sm3_round_16 28
|
|
+ sm3_round_16 29
|
|
+ sm3_round_16 30
|
|
+ sm3_round_16 31
|
|
+ sm3_round_16 32
|
|
+ sm3_round_16 33
|
|
+ sm3_round_16 34
|
|
+ sm3_round_16 35
|
|
+ sm3_round_16 36
|
|
+ sm3_round_16 37
|
|
+ sm3_round_16 38
|
|
+ sm3_round_16 39
|
|
+ sm3_round_16 40
|
|
+ sm3_round_16 41
|
|
+ sm3_round_16 42
|
|
+ sm3_round_16 43
|
|
+ sm3_round_16 44
|
|
+ sm3_round_16 45
|
|
+ sm3_round_16 46
|
|
+ sm3_round_16 47
|
|
+ sm3_round_16 48
|
|
+ sm3_round_16 49
|
|
+ sm3_round_16 50
|
|
+ sm3_round_16 51
|
|
+ sm3_round_16 52
|
|
+ sm3_round_16 53
|
|
+ sm3_round_16 54
|
|
+ sm3_round_16 55
|
|
+ sm3_round_16 56
|
|
+ sm3_round_16 57
|
|
+ sm3_round_16 58
|
|
+ sm3_round_16 59
|
|
+ sm3_round_16 60
|
|
+ sm3_round_16 61
|
|
+ sm3_round_16 62
|
|
+ sm3_round_63 63
|
|
+ subs len,len,1
|
|
+ eor vdig0.16b,vdig0.16b,vdig0_bak.16b
|
|
+ eor vdig1.16b,vdig1.16b,vdig1_bak.16b
|
|
+ bne .start_loop
|
|
+.exit_func:
|
|
+ ldp x19, x20, [sp, 16]
|
|
+ rev32 vdig0.16b,vdig0.16b
|
|
+ ldp x21, x22, [sp, 32]
|
|
+ rev32 vdig1.16b,vdig1.16b
|
|
+ ldp x23, x24, [sp, 48]
|
|
+ stp qdig0,qdig1,[digest]
|
|
+ ldp x25, x26, [sp, 64]
|
|
+ ldp x27, x28, [sp, 80]
|
|
+ ldp x29, x30, [sp], STACK_SIZE
|
|
+ ret
|
|
+ .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1
|
|
+
|
|
diff --git a/drv/hash_mb/sm3_mb_asimd_x4.S b/drv/hash_mb/sm3_mb_asimd_x4.S
new file mode 100644
index 0000000..975a07c
--- /dev/null
+++ b/drv/hash_mb/sm3_mb_asimd_x4.S
@@ -0,0 +1,576 @@
|
|
+/**********************************************************************
|
|
+ Copyright(c) 2020 Arm Corporation All rights reserved.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Arm Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+ .arch armv8.2-a
|
|
+ .text
|
|
+ .align 2
|
|
+ .p2align 3,,7
|
|
+
|
|
+.macro declare_var_vector_reg name:req,reg:req
|
|
+ q\name\() .req q\reg
|
|
+ v\name\() .req v\reg
|
|
+ s\name\() .req s\reg
|
|
+.endm
|
|
+
|
|
+ job0 .req x0
|
|
+ job1 .req x1
|
|
+ job2 .req x2
|
|
+ job3 .req x3
|
|
+ len .req x4
|
|
+
|
|
+ job0_data .req x5
|
|
+ job1_data .req x6
|
|
+ job2_data .req x7
|
|
+ job3_data .req x9
|
|
+
|
|
+ job0_digest .req x0
|
|
+ job1_digest .req x1
|
|
+ job2_digest .req x2
|
|
+ job3_digest .req x3
|
|
+ job0_tmp .req x10
|
|
+ job1_tmp .req x11
|
|
+ job2_tmp .req x12
|
|
+ job3_tmp .req x13
|
|
+ const_adr .req x14
|
|
+
|
|
+
|
|
+ declare_var_vector_reg msg0,0
|
|
+ declare_var_vector_reg msg1,1
|
|
+ declare_var_vector_reg msg2,2
|
|
+ declare_var_vector_reg msg3,3
|
|
+ declare_var_vector_reg msg4,4
|
|
+ declare_var_vector_reg msg5,5
|
|
+ declare_var_vector_reg msg6,6
|
|
+ declare_var_vector_reg msg7,7
|
|
+ declare_var_vector_reg msg8,8
|
|
+ declare_var_vector_reg msg9,9
|
|
+ declare_var_vector_reg msg10,10
|
|
+ declare_var_vector_reg msg11,11
|
|
+ declare_var_vector_reg msg12,12
|
|
+ declare_var_vector_reg msg13,13
|
|
+ declare_var_vector_reg msg14,14
|
|
+ declare_var_vector_reg msg15,15
|
|
+ declare_var_vector_reg msg16,16
|
|
+
|
|
+
|
|
+ declare_var_vector_reg dig_A,24
|
|
+ declare_var_vector_reg dig_B,25
|
|
+ declare_var_vector_reg dig_C,26
|
|
+ declare_var_vector_reg dig_D,27
|
|
+ declare_var_vector_reg dig_E,28
|
|
+ declare_var_vector_reg dig_F,29
|
|
+ declare_var_vector_reg dig_G,30
|
|
+ declare_var_vector_reg dig_H,31
|
|
+
|
|
+ declare_var_vector_reg TT1,17
|
|
+ declare_var_vector_reg TT2,18
|
|
+ declare_var_vector_reg SS1,19
|
|
+ declare_var_vector_reg SS2,20
|
|
+ declare_var_vector_reg tmp0,21
|
|
+ declare_var_vector_reg word_pair,23
|
|
+ declare_var_vector_reg Tj,22
|
|
+
|
|
+
|
|
+.macro rol32 target:req,reg:req,bit:req
|
|
+ ushr v\target\().4s,v\reg\().4s,32 - \bit
|
|
+ sli v\target\().4s,v\reg\().4s,\bit
|
|
+.endm
|
|
+
|
|
+// round 0-11
|
|
+.macro sm3_round_0 round:req,wp:req
|
|
+
|
|
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
|
|
+
|
|
+ add vSS1.4s,vdig_E.4s,vTj.4s
|
|
+ sli vtmp0.4s,vdig_A.4s,12
|
|
+ rev32 vmsg\round\().16b,vmsg\round\().16b
|
|
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
|
|
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
|
|
+ rol32 SS1,TT1,7
|
|
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
|
|
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
|
|
+
|
|
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
|
|
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
|
|
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
|
|
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
|
|
+
|
|
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
|
|
+ add vSS2.4s,vSS2.4s,vword_pair.4s
|
|
+ add vTT1.4s,vTT1.4s,vdig_D.4s
|
|
+ add vTT2.4s,vTT2.4s,vdig_H.4s
|
|
+ ushr vtmp0.4s,vTj.4s,32-1
|
|
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
|
|
+ sli vtmp0.4s,vTj.4s,1
|
|
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
|
|
+ mov vTj.16b,vtmp0.16b
|
|
+ //D=C
|
|
+ mov vdig_D.16b,vdig_C.16b
|
|
+ //C = ROTL32(B, 9);
|
|
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
|
|
+ sli vdig_C.4s,vdig_B.4s,9
|
|
+ //B=A
|
|
+ mov vdig_B.16b,vdig_A.16b
|
|
+ //A=TT1
|
|
+ mov vdig_A.16b,vTT1.16b
|
|
+ // H=G
|
|
+ mov vdig_H.16b,vdig_G.16b
|
|
+ //G = ROTL32(F,19)
|
|
+ rol32 dig_G,dig_F,19
|
|
+ //F = E
|
|
+ mov vdig_F.16b,vdig_E.16b
|
|
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
|
|
+ // E = P0(TT2);
|
|
+ ushr vSS2.4s, vTT2.4s, 32 - 9
|
|
+ ushr vSS1.4s, vTT2.4s, 32 - 17
|
|
+ sli vSS2.4s, vTT2.4s, 9
|
|
+ sli vSS1.4s, vTT2.4s, 17
|
|
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
|
|
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
|
|
+
|
|
+.endm
|
|
+
|
|
+
|
|
+.macro sm3_round_4 round:req,wp:req
|
|
+
|
|
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
|
|
+ add vSS1.4s,vdig_E.4s,vTj.4s
|
|
+ sli vtmp0.4s,vdig_A.4s,12
|
|
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
|
|
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
|
|
+ rol32 SS1,TT1,7
|
|
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
|
|
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
|
|
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
|
|
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
|
|
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
|
|
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
|
|
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
|
|
+ add vSS2.4s,vSS2.4s,vword_pair.4s
|
|
+ add vTT1.4s,vTT1.4s,vdig_D.4s
|
|
+ add vTT2.4s,vTT2.4s,vdig_H.4s
|
|
+ ushr vtmp0.4s,vTj.4s,32-1
|
|
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
|
|
+ sli vtmp0.4s,vTj.4s,1
|
|
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
|
|
+ mov vTj.16b,vtmp0.16b
|
|
+ //D=C
|
|
+ mov vdig_D.16b,vdig_C.16b
|
|
+ //C = ROTL32(B, 9);
|
|
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
|
|
+ sli vdig_C.4s,vdig_B.4s,9
|
|
+ //B=A
|
|
+ mov vdig_B.16b,vdig_A.16b
|
|
+ //A=TT1
|
|
+ mov vdig_A.16b,vTT1.16b
|
|
+ // H=G
|
|
+ mov vdig_H.16b,vdig_G.16b
|
|
+ //G = ROTL32(F,19)
|
|
+ rol32 dig_G,dig_F,19
|
|
+ //F = E
|
|
+ mov vdig_F.16b,vdig_E.16b
|
|
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
|
|
+ // E = P0(TT2);
|
|
+ ushr vSS2.4s, vTT2.4s, 32 - 9
|
|
+ ushr vSS1.4s, vTT2.4s, 32 - 17
|
|
+ sli vSS2.4s, vTT2.4s, 9
|
|
+ sli vSS1.4s, vTT2.4s, 17
|
|
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
|
|
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
|
|
+
|
|
+.endm
|
|
+
|
|
+//round 12-15
|
|
+.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4
|
|
+ rol32 msg\plus_4,msg\m2,15
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
|
|
+ rol32 tmp0,msg\plus_4,15
|
|
+ rol32 word_pair,msg\plus_4,23
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
|
|
+ rol32 tmp0,msg\m3,7
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
|
|
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
|
|
+ sli vtmp0.4s,vdig_A.4s,12
|
|
+ add vSS1.4s,vdig_E.4s,vTj.4s
|
|
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
|
|
+ rol32 SS1,SS2,7
|
|
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
|
|
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
|
|
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
|
|
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
|
|
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
|
|
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
|
|
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
|
|
+ add vSS2.4s,vSS2.4s,vword_pair.4s
|
|
+ add vTT1.4s,vTT1.4s,vdig_D.4s
|
|
+ add vTT2.4s,vTT2.4s,vdig_H.4s
|
|
+ ushr vtmp0.4s,vTj.4s,32-1
|
|
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
|
|
+ sli vtmp0.4s,vTj.4s,1
|
|
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
|
|
+ mov vTj.16b,vtmp0.16b
|
|
+ //D=C
|
|
+ mov vdig_D.16b,vdig_C.16b
|
|
+ //C = ROTL32(B, 9);
|
|
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
|
|
+ sli vdig_C.4s,vdig_B.4s,9
|
|
+ //B=A
|
|
+ mov vdig_B.16b,vdig_A.16b
|
|
+ //A=TT1
|
|
+ mov vdig_A.16b,vTT1.16b
|
|
+ // H=G
|
|
+ mov vdig_H.16b,vdig_G.16b
|
|
+ //G = ROTL32(F,19)
|
|
+ rol32 dig_G,dig_F,19
|
|
+ //F = E
|
|
+ mov vdig_F.16b,vdig_E.16b
|
|
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
|
|
+ // E = P0(TT2);
|
|
+ ushr vSS2.4s, vTT2.4s, 32 - 9
|
|
+ ushr vSS1.4s, vTT2.4s, 32 - 17
|
|
+ sli vSS2.4s, vTT2.4s, 9
|
|
+ sli vSS1.4s, vTT2.4s, 17
|
|
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
|
|
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
|
|
+.endm
|
|
+
|
|
+// round 16-62
|
|
+.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4
|
|
+ rol32 msg\plus_4,msg\m2,15
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
|
|
+ rol32 tmp0,msg\plus_4,15
|
|
+ rol32 word_pair,msg\plus_4,23
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
|
|
+ rol32 tmp0,msg\m3,7
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
|
|
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
|
|
+ sli vtmp0.4s,vdig_A.4s,12
|
|
+ add vSS1.4s,vdig_E.4s,vTj.4s
|
|
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
|
|
+ rol32 SS1,SS2,7
|
|
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
|
|
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
|
|
+ mov vTT2.16b,vdig_E.16b
|
|
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
|
|
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
|
|
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b
|
|
+ and vTT1.16b,vTT1.16b,vdig_A.16b
|
|
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
|
|
+ orr vTT1.16b,vTT1.16b,vtmp0.16b
|
|
+ add vSS2.4s,vSS2.4s,vword_pair.4s
|
|
+ add vTT1.4s,vTT1.4s,vdig_D.4s
|
|
+ add vTT2.4s,vTT2.4s,vdig_H.4s
|
|
+ ushr vtmp0.4s,vTj.4s,32-1
|
|
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
|
|
+ sli vtmp0.4s,vTj.4s,1
|
|
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
|
|
+ mov vTj.16b,vtmp0.16b
|
|
+ //D=C
|
|
+ mov vdig_D.16b,vdig_C.16b
|
|
+ //C = ROTL32(B, 9);
|
|
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
|
|
+ sli vdig_C.4s,vdig_B.4s,9
|
|
+ //B=A
|
|
+ mov vdig_B.16b,vdig_A.16b
|
|
+ //A=TT1
|
|
+ mov vdig_A.16b,vTT1.16b
|
|
+ // H=G
|
|
+ mov vdig_H.16b,vdig_G.16b
|
|
+ //G = ROTL32(F,19)
|
|
+ rol32 dig_G,dig_F,19
|
|
+ //F = E
|
|
+ mov vdig_F.16b,vdig_E.16b
|
|
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
|
|
+ // E = P0(TT2);
|
|
+ ushr vSS2.4s, vTT2.4s, 32 - 9
|
|
+ ushr vSS1.4s, vTT2.4s, 32 - 17
|
|
+ sli vSS2.4s, vTT2.4s, 9
|
|
+ sli vSS1.4s, vTT2.4s, 17
|
|
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
|
|
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
|
|
+.endm
|
|
+
|
|
+//round 63
|
|
+.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4
|
|
+ rol32 msg\plus_4,msg\m2,15
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
|
|
+ rol32 tmp0,msg\plus_4,15
|
|
+ rol32 word_pair,msg\plus_4,23
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
|
|
+ rol32 tmp0,msg\m3,7
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
|
|
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
|
|
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
|
|
+ sli vtmp0.4s,vdig_A.4s,12
|
|
+ add vSS1.4s,vdig_E.4s,vTj.4s
|
|
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
|
|
+ rol32 SS1,SS2,7
|
|
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
|
|
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
|
|
+
|
|
+ ldp qmsg0,qmsg1,[sp,dig_off+ 0]
|
|
+ mov vTT2.16b,vdig_E.16b
|
|
+ ldp qmsg2,qmsg3,[sp,dig_off+ 32]
|
|
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
|
|
+ ldp qmsg4,qmsg5,[sp,dig_off+ 64]
|
|
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
|
|
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b
|
|
+ ldp qmsg6,qmsg7,[sp,dig_off+ 96]
|
|
+ and vTT1.16b,vTT1.16b,vdig_A.16b
|
|
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
|
|
+ orr vTT1.16b,vTT1.16b,vtmp0.16b
|
|
+ add vSS2.4s,vSS2.4s,vword_pair.4s
|
|
+ add vTT1.4s,vTT1.4s,vdig_D.4s
|
|
+ add vTT2.4s,vTT2.4s,vdig_H.4s
|
|
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
|
|
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
|
|
+ //D=C
|
|
+ eor vdig_D.16b,vdig_C.16b,vmsg3.16b
|
|
+ //C = ROTL32(B, 9);
|
|
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
|
|
+ sli vdig_C.4s,vdig_B.4s,9
|
|
+ eor vdig_C.16b,vdig_C.16b,vmsg2.16b
|
|
+ //B=A
|
|
+ eor vdig_B.16b,vdig_A.16b,vmsg1.16b
|
|
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
|
|
+ //A=TT1
|
|
+ eor vdig_A.16b,vTT1.16b,vmsg0.16b
|
|
+ // H=G
|
|
+ eor vdig_H.16b,vdig_G.16b,vmsg7.16b
|
|
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
|
|
+ //G = ROTL32(F,19)
|
|
+ rol32 dig_G,dig_F,19
|
|
+ eor vdig_G.16b,vdig_G.16b,vmsg6.16b
|
|
+ //F = E
|
|
+ eor vdig_F.16b,vdig_E.16b,vmsg5.16b
|
|
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
|
|
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
|
|
+ // E = P0(TT2);
|
|
+ ushr vSS2.4s, vTT2.4s, 32 - 9
|
|
+ ushr vSS1.4s, vTT2.4s, 32 - 17
|
|
+ sli vSS2.4s, vTT2.4s, 9
|
|
+ sli vSS1.4s, vTT2.4s, 17
|
|
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
|
|
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
|
|
+ eor vdig_E.16b, vdig_E.16b, vmsg4.16b
|
|
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
|
|
+.endm
|
|
+
|
|
+ .set dig_off , 80
|
|
+
|
|
+#define STACK_SIZE 224
|
|
+ .global sm3_mb_asimd_x4
|
|
+ .type sm3_mb_asimd_x4, %function
|
|
+sm3_mb_asimd_x4:
|
|
+ stp x29,x30, [sp,-STACK_SIZE]!
|
|
+ cmp len,0
|
|
+ //push d8~d15
|
|
+ ldr job0_data, [job0],64
|
|
+ stp d8,d9, [sp,16]
|
|
+ ldr job1_data, [job1],64
|
|
+ stp d10,d11,[sp,32]
|
|
+ ldr job2_data, [job2],64
|
|
+ stp d12,d13,[sp,48]
|
|
+ ldr job3_data, [job3],64
|
|
+ stp d14,d15,[sp,64]
|
|
+ ble .exit_func
|
|
+
|
|
+ mov job0_tmp,job0_digest
|
|
+ mov job1_tmp,job1_digest
|
|
+ mov job2_tmp,job2_digest
|
|
+ mov job3_tmp,job3_digest
|
|
+ //load digests
|
|
+ ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16
|
|
+ ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16
|
|
+ ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16
|
|
+ adrp const_adr, .consts
|
|
+ ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16
|
|
+ add const_adr, const_adr, #:lo12:.consts
|
|
+ ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp]
|
|
+ rev32 vdig_A.16b,vdig_A.16b
|
|
+ ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp]
|
|
+ rev32 vdig_B.16b,vdig_B.16b
|
|
+ ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp]
|
|
+ rev32 vdig_C.16b,vdig_C.16b
|
|
+ ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp]
|
|
+ rev32 vdig_D.16b,vdig_D.16b
|
|
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
|
|
+ rev32 vdig_E.16b,vdig_E.16b
|
|
+ rev32 vdig_F.16b,vdig_F.16b
|
|
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
|
|
+ rev32 vdig_G.16b,vdig_G.16b
|
|
+ rev32 vdig_H.16b,vdig_H.16b
|
|
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
|
|
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
|
|
+
|
|
+.start_loop:
|
|
+ ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16
|
|
+ ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16
|
|
+ ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16
|
|
+ ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16
|
|
+ ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16
|
|
+ ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16
|
|
+ ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16
|
|
+ ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16
|
|
+	ld4 {vmsg8.s-vmsg11.s}[0],[job0_data],16
|
|
+ ldr qTj,[const_adr]
|
|
+
|
|
+ sm3_round_0 0, 4
|
|
+
|
|
+ ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16
|
|
+ sm3_round_0 1, 5
|
|
+
|
|
+ ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16
|
|
+ sm3_round_0 2, 6
|
|
+ ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16
|
|
+ sm3_round_0 3, 7
|
|
+
|
|
+ ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16
|
|
+
|
|
+ sm3_round_4 4, 8
|
|
+ ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16
|
|
+ sm3_round_4 5, 9
|
|
+ ld4 {vmsg12.s-vmsg15.s}[2],[job2_data],16
|
|
+ sm3_round_4 6,10
|
|
+ ld4 {vmsg12.s-vmsg15.s}[3],[job3_data],16
|
|
+ sm3_round_4 7,11
|
|
+ sm3_round_4 8,12
|
|
+ sm3_round_4 9,13
|
|
+ sm3_round_4 10,14
|
|
+ sm3_round_4 11,15
|
|
+
|
|
+ sm3_round_12 12,16, 0, 7,13, 3,10 //12
|
|
+ sm3_round_12 13, 0, 1, 8,14, 4,11 //13
|
|
+ sm3_round_12 14, 1, 2, 9,15, 5,12 //14
|
|
+ sm3_round_12 15, 2, 3,10,16, 6,13 //15
|
|
+
|
|
+ ldr qTj,[const_adr,16]
|
|
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //16
|
|
+#if 0
|
|
+ stp sdig_A,sdig_B,[job0_digest]
|
|
+ stp sdig_C,sdig_D,[job0_digest,8]
|
|
+ stp sdig_E,sdig_F,[job0_digest,16]
|
|
+ stp sdig_G,sdig_H,[job0_digest,24]
|
|
+ b .exit_func
|
|
+#endif
|
|
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //17
|
|
+
|
|
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //18
|
|
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //19
|
|
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //20
|
|
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //21
|
|
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //22
|
|
+ sm3_round_16 6,10,11, 1, 7,14, 4 //23
|
|
+ sm3_round_16 7,11,12, 2, 8,15, 5 //24
|
|
+ sm3_round_16 8,12,13, 3, 9,16, 6 //25
|
|
+ sm3_round_16 9,13,14, 4,10, 0, 7 //26
|
|
+ sm3_round_16 10,14,15, 5,11, 1, 8 //27
|
|
+ sm3_round_16 11,15,16, 6,12, 2, 9 //28
|
|
+ sm3_round_16 12,16, 0, 7,13, 3,10 //29
|
|
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //30
|
|
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //31
|
|
+ sm3_round_16 15, 2, 3,10,16, 6,13 //32
|
|
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //33
|
|
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //34
|
|
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //35
|
|
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //36
|
|
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //37
|
|
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //38
|
|
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //39
|
|
+ sm3_round_16 6,10,11, 1, 7,14, 4 //40
|
|
+ sm3_round_16 7,11,12, 2, 8,15, 5 //41
|
|
+ sm3_round_16 8,12,13, 3, 9,16, 6 //42
|
|
+ sm3_round_16 9,13,14, 4,10, 0, 7 //43
|
|
+ sm3_round_16 10,14,15, 5,11, 1, 8 //44
|
|
+ sm3_round_16 11,15,16, 6,12, 2, 9 //45
|
|
+ sm3_round_16 12,16, 0, 7,13, 3,10 //46
|
|
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //47
|
|
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //48
|
|
+ sm3_round_16 15, 2, 3,10,16, 6,13 //49
|
|
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //50
|
|
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //51
|
|
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //52
|
|
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //53
|
|
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //54
|
|
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //55
|
|
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //56
|
|
+ sm3_round_16 6,10,11, 1, 7,14, 4 //57
|
|
+ sm3_round_16 7,11,12, 2, 8,15, 5 //58
|
|
+ sm3_round_16 8,12,13, 3, 9,16, 6 //59
|
|
+ sm3_round_16 9,13,14, 4,10, 0, 7 //60
|
|
+ sm3_round_16 10,14,15, 5,11, 1, 8 //61
|
|
+ sm3_round_16 11,15,16, 6,12, 2, 9 //62
|
|
+ sm3_round_63 12,16, 0, 7,13, 3,10 //63
|
|
+
|
|
+ subs len,len,1
|
|
+ bne .start_loop
|
|
+
|
|
+ //save digests with big endian
|
|
+ rev32 vdig_A.16b,vdig_A.16b
|
|
+ rev32 vdig_B.16b,vdig_B.16b
|
|
+ rev32 vdig_C.16b,vdig_C.16b
|
|
+ rev32 vdig_D.16b,vdig_D.16b
|
|
+ st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16
|
|
+ rev32 vdig_E.16b,vdig_E.16b
|
|
+ rev32 vdig_F.16b,vdig_F.16b
|
|
+ st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16
|
|
+ rev32 vdig_G.16b,vdig_G.16b
|
|
+ rev32 vdig_H.16b,vdig_H.16b
|
|
+ st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16
|
|
+ st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16
|
|
+ st4 {vdig_E.s-vdig_H.s}[0],[job0_digest]
|
|
+ st4 {vdig_E.s-vdig_H.s}[1],[job1_digest]
|
|
+ st4 {vdig_E.s-vdig_H.s}[2],[job2_digest]
|
|
+ st4 {vdig_E.s-vdig_H.s}[3],[job3_digest]
|
|
+
|
|
+.exit_func:
|
|
+ ldp d8, d9, [sp,16]
|
|
+ ldp d10,d11,[sp,32]
|
|
+ ldp d12,d13,[sp,48]
|
|
+ ldp d14,d15,[sp,64]
|
|
+ ldp x29, x30, [sp], STACK_SIZE
|
|
+ ret
|
|
+.consts:
|
|
+ .word 0x79cc4519
|
|
+ .word 0x79cc4519
|
|
+ .word 0x79cc4519
|
|
+ .word 0x79cc4519
|
|
+ .word 0x9d8a7a87
|
|
+ .word 0x9d8a7a87
|
|
+ .word 0x9d8a7a87
|
|
+ .word 0x9d8a7a87
|
|
+ .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4
|
|
+
|
|
diff --git a/drv/hash_mb/sm3_mb_sve.S b/drv/hash_mb/sm3_mb_sve.S
new file mode 100644
index 0000000..7dd2428
--- /dev/null
+++ b/drv/hash_mb/sm3_mb_sve.S
@@ -0,0 +1,161 @@
|
|
+/**********************************************************************
|
|
+ Copyright(c) 2022 Arm Corporation All rights reserved.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Arm Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+ .arch armv8.2-a+sve
|
|
+
|
|
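+// copy each lane's current 64-byte block from the scattered job buffers
+// into contiguous stack space so it can be gather-loaded into SVE registers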
+.macro copy_mb_16words vecs:req,dest:req
|
|
+ mov src,\vecs
|
|
+ mov dst,\dest
|
|
+ mov ctr,lanes
|
|
+1:
|
|
+ ldr tmp,[src],8
|
|
+ ldr tmp,[tmp]
|
|
+ add tmp,tmp,block_ctr,lsl 6
|
|
+ ld1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [tmp]
|
|
+ st1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [dst],64
|
|
+ subs ctr,ctr,1
|
|
+ b.ne 1b
|
|
+.endm
|
|
+
|
|
+.macro load_words windex:req
|
|
+ .if \windex == 0
|
|
+ mov tmpw,16
|
|
+ index VOFFS.s,0,tmpw
|
|
+ copy_mb_16words job_vec,databuf
|
|
+ mov dataptr,databuf
|
|
+ .endif
|
|
+ ld1w { WORD\windex\().s}, p0/z, [dataptr, VOFFS.s, UXTW 2]
|
|
+ add dataptr,dataptr,4
|
|
+.endm
|
|
+
|
|
+#include "sm3_sve_common.S"
|
|
+
|
|
+/* int sm3_mb_sve_max_lanes()
|
|
+ * return : max lanes of SVE vector
|
|
+ */
|
|
+ .global sm3_mb_sve_max_lanes
|
|
+ .type sm3_mb_sve_max_lanes, %function
|
|
+sm3_mb_sve_max_lanes:
|
|
+ cntw x0
|
|
+ ret
|
|
+ .size sm3_mb_sve_max_lanes, .-sm3_mb_sve_max_lanes
|
|
+/*
|
|
+ * void sm3_mb_sve(int blocks, int total_lanes, SM3_JOB **job_vec)
|
|
+ */
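+/*
+ * Same job layout as the MD5 kernel: the first 8 bytes of each job are the
+ * lane's data pointer and the 32-byte digest sits at job offset 64; digest
+ * words are byte-swapped with rev32 on load and store, since the SM3 state
+ * is kept big-endian in memory.
+ */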
|
|
+ num_blocks .req w0
|
|
+ total_lanes .req w1
|
|
+ job_vec .req x2
|
|
+ lanes .req x4
|
|
+ src .req x5
|
|
+ dst .req x6
|
|
+ lane_offset .req w7
|
|
+ lane_offset_x .req x7
|
|
+ tmp .req x8
|
|
+ tmpw .req w8
|
|
+ block_ctr .req x9
|
|
+ block_ctr_w .req w9
|
|
+ savedsp .req x10
|
|
+ databuf .req x11
|
|
+ dataptr .req x12
|
|
+ efgh_buf .req x12
|
|
+ ctr .req x13
|
|
+ abcd_buf .req x14
|
|
+ sm3const_adr .req x15
|
|
+
|
|
+ .global sm3_mb_sve
|
|
+ .type sm3_mb_sve, %function
|
|
+sm3_mb_sve:
|
|
+ cbz num_blocks,.return
|
|
+ sm3_sve_save_stack
|
|
+ mov savedsp,sp
|
|
+ mov lane_offset, #0
|
|
+ whilelo p0.s, wzr, total_lanes
|
|
+ // reserve (32 * max lanes) for abcdefgh buf
|
|
+ cntw tmp
|
|
+ lsl tmp, tmp, 5
|
|
+ sub abcd_buf,sp,tmp
|
|
+ mov tmp,63
|
|
+ bic abcd_buf,abcd_buf,tmp
|
|
+ // reserve (64 * lanes) for data buf
|
|
+ cntp lanes,p0,p0.s
|
|
+ lsl tmp,lanes,6
|
|
+ sub databuf,abcd_buf,tmp
|
|
+ mov sp,databuf
|
|
+ adr sm3const_adr,SM3_CONSTS
|
|
+.seg_loops:
|
|
+ mov src,job_vec
|
|
+ mov dst,abcd_buf
|
|
+ cntp lanes,p0,p0.s
|
|
+ add efgh_buf,abcd_buf,lanes,lsl 4
|
|
+ mov ctr,lanes
|
|
+.ldr_hash:
|
|
+ ldr tmp,[src],8
|
|
+ add tmp,tmp,64
|
|
+ ld1 {v0.16b, v1.16b},[tmp]
|
|
+ rev32 v0.16b,v0.16b
|
|
+ rev32 v1.16b,v1.16b
|
|
+ st1 {v0.16b},[dst],16
|
|
+ st1 {v1.16b},[efgh_buf],16
|
|
+ subs ctr,ctr,1
|
|
+ bne .ldr_hash
|
|
+ ld4w {VA.s,VB.s,VC.s,VD.s},p0/z,[abcd_buf]
|
|
+ add tmp,abcd_buf,lanes,lsl 4
|
|
+ ld4w {VE.s,VF.s,VG.s,VH.s},p0/z,[tmp]
|
|
+ mov block_ctr,0
|
|
+ // always unpredicated SVE mode in current settings
|
|
+ pred_mode=0
|
|
+.block_loop:
|
|
+ sm3_single
|
|
+ add block_ctr, block_ctr, 1
|
|
+ cmp block_ctr_w,num_blocks
|
|
+ bne .block_loop
|
|
+ st4w {VA.s,VB.s,VC.s,VD.s},p0,[abcd_buf]
|
|
+ add efgh_buf,abcd_buf,lanes,lsl 4
|
|
+ st4w {VE.s,VF.s,VG.s,VH.s},p0,[efgh_buf]
|
|
+ mov dst,job_vec
|
|
+ mov src,abcd_buf
|
|
+ add job_vec,job_vec,lanes,lsl 3
|
|
+ mov ctr,lanes
|
|
+.str_hash:
|
|
+ ld1 {v0.16b},[src],16
|
|
+ ld1 {v1.16b},[efgh_buf],16
|
|
+ rev32 v0.16b,v0.16b
|
|
+ rev32 v1.16b,v1.16b
|
|
+ ldr tmp,[dst],8
|
|
+ add tmp,tmp,64
|
|
+ st1 {v0.16b,v1.16b},[tmp]
|
|
+ subs ctr,ctr,1
|
|
+ bne .str_hash
|
|
+ incw lane_offset_x
|
|
+ whilelo p0.s, lane_offset, total_lanes
|
|
+ b.mi .seg_loops
|
|
+ mov sp,savedsp
|
|
+ sm3_sve_restore_stack
|
|
+.return:
|
|
+ ret
|
|
+ .size sm3_mb_sve, .-sm3_mb_sve
|
|
diff --git a/drv/hash_mb/sm3_sve_common.S b/drv/hash_mb/sm3_sve_common.S
new file mode 100644
index 0000000..3d54952
--- /dev/null
+++ b/drv/hash_mb/sm3_sve_common.S
@@ -0,0 +1,505 @@
|
|
+/**********************************************************************
|
|
+ Copyright(c) 2022 Arm Corporation All rights reserved.
|
|
+
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
+ modification, are permitted provided that the following conditions
|
|
+ are met:
|
|
+ * Redistributions of source code must retain the above copyright
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in
|
|
+ the documentation and/or other materials provided with the
|
|
+ distribution.
|
|
+ * Neither the name of Arm Corporation nor the names of its
|
|
+ contributors may be used to endorse or promote products derived
|
|
+ from this software without specific prior written permission.
|
|
+
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+**********************************************************************/
|
|
+ VA .req z0
|
|
+ VB .req z1
|
|
+ VC .req z2
|
|
+ VD .req z3
|
|
+ VE .req z4
|
|
+ VF .req z5
|
|
+ VG .req z6
|
|
+ VH .req z7
|
|
+ TMPV0 .req v8
|
|
+ TMPV1 .req v9
|
|
+ TMPV2 .req v10
|
|
+ TMPV3 .req v11
|
|
+ WORD0 .req z8
|
|
+ WORD1 .req z9
|
|
+ WORD2 .req z10
|
|
+ WORD3 .req z11
|
|
+ WORD4 .req z12
|
|
+ WORD5 .req z13
|
|
+ WORD6 .req z14
|
|
+ WORD7 .req z15
|
|
+ WORD8 .req z16
|
|
+ WORD9 .req z17
|
|
+ WORD10 .req z18
|
|
+ WORD11 .req z19
|
|
+ WORD12 .req z20
|
|
+ WORD13 .req z21
|
|
+ WORD14 .req z22
|
|
+ WORD15 .req z23
|
|
+ WORD16 .req z24
|
|
+ VOFFS .req z24 // reuse WORD16
|
|
+ SS1 .req z25
|
|
+ SS2 .req z26
|
|
+ VT .req z26 // reuse SS2
|
|
+ TT2 .req z27
|
|
+ VT1 .req z28
|
|
+ VT2 .req z29
|
|
+ VT3 .req z30
|
|
+ VT4 .req z31
|
|
+ VZERO .req z31
|
|
+ TT .req z0
|
|
+
|
|
+.macro sve_op inst:req,regd,args:vararg
|
|
+ .if pred_mode == 1
|
|
+ \inst \regd,p0/m,\args
|
|
+ .else
|
|
+ \inst \regd,\args
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro sve_bitop inst:req,regd:req,regm:req
|
|
+ .if pred_mode == 1
|
|
+ \inst \regd\().s,p0/m,\regd\().s,\regm\().s
|
|
+ .else
|
|
+ \inst \regd\().d,\regd\().d,\regm\().d
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro rotate_left0 out:req,in:req,tmp:req,bits:req,args:vararg
|
|
+ .if have_sve2 == 0
|
|
+ lsl \tmp\().s,\in\().s,\bits
|
|
+ .else
|
|
+ movprfx \out\().d,\in\().d
|
|
+ xar \out\().s,\out\().s,VZERO.s,32-\bits
|
|
+ .endif
|
|
+
|
|
+ .ifnb \args
|
|
+ rotate_left0 \args
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro rotate_left1 out:req,in:req,tmp:req,bits:req,args:vararg
|
|
+ .if have_sve2 == 0
|
|
+ lsr \out\().s,\in\().s,32-\bits
|
|
+ .endif
|
|
+
|
|
+ .ifnb \args
|
|
+ rotate_left1 \args
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro rotate_left2 out:req,in:req,tmp:req,bits:req,args:vararg
|
|
+ .if have_sve2 == 0
|
|
+ orr \out\().d,\out\().d,\tmp\().d
|
|
+ .endif
|
|
+
|
|
+ .ifnb \args
|
|
+ rotate_left2 \args
|
|
+ .endif
|
|
+.endm
|
|
+
|
|
+.macro rotate_left args:vararg
|
|
+ rotate_left0 \args
|
|
+ rotate_left1 \args
|
|
+ rotate_left2 \args
|
|
+.endm
|
|
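
A scalar note on rotate_left: without SVE2 the rotate is built from lsl/lsr/orr, and the
three rotate_left0/1/2 passes emit each stage for every rotate passed in the vararg list,
presumably so independent rotates can overlap; with SVE2 a single xar against the all-zero
VZERO register gives the same result, since ROR(x ^ 0, 32 - bits) equals ROL(x, bits).
A minimal C equivalent of what one group computes per 32-bit lane:

#include <stdint.h>

/* valid for 1 <= bits <= 31, which covers every rotate used in this file */
static inline uint32_t rol32(uint32_t x, unsigned int bits)
{
	return (x << bits) | (x >> (32 - bits));	/* lsl + lsr + orr path */
}
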
+
+.macro SVE_EOR3 rd:req,r1:req,r2:req
+ .if have_sve2 == 0
+ sve_bitop eor,\rd,\r1
+ sve_bitop eor,\rd,\r2
+ .else
+ eor3 \rd\().d,\rd\().d,\r1\().d,\r2\().d
+ .endif
+.endm
+
+.macro FUNC_EOR3 ret:req,x:req,y:req,z:req
+ .if have_sve2 == 0
+ eor \ret\().d,\x\().d,\y\().d
+ sve_bitop eor,\ret,\z
+ .else
+ movprfx \ret\().d,\x\().d
+ eor3 \ret\().d,\ret\().d,\y\().d,\z\().d
+ .endif
+.endm
+
+.macro FUNC_FF windex:req,ret:req,x:req,y:req,z:req,tmp1:req,tmp2:req
+ and \ret\().d,\x\().d,\y\().d
+ and \tmp1\().d,\x\().d,\z\().d
+ and \tmp2\().d,\y\().d,\z\().d
+ sve_bitop orr,\ret,\tmp1
+ sve_bitop orr,\ret,\tmp2
+.endm
+
+.macro FUNC_BSL ret:req,x:req,y:req,z:req,tmp:req
+ .if have_sve2 == 0
+ bic \ret\().d,\z\().d,\x\().d
+ and \tmp\().d,\x\().d,\y\().d
+ sve_bitop orr,\ret,\tmp
+ .else
+ movprfx \ret\().d,\x\().d
+ bsl \ret\().d,\ret\().d,\y\().d,\z\().d
+ .endif
+.endm
+
+.altmacro
+.macro load_next_words windex
+ .if \windex < 16
+ load_words \windex
+ .endif
+.endm
+
+.macro SM3_STEP_00_11 windex:req,w:req,w4:req
+ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7)
+ ld1rw {VT2.s},p0/z,[sm3const_adr,\windex * 4]
+ rotate_left SS1,VA,VT1,12
+ mov SS2.s,p0/m,SS1.s
+ sve_op add,SS1.s,SS1.s,VE.s
+ sve_op add,SS1.s,SS1.s,VT2.s
+ rotate_left SS1,SS1,VT2,7
+ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index]
+ add VT2.s,\w\().s,VH.s
+ FUNC_EOR3 TT2,VE,VF,VG
+ // SS2 = SS1 ^ rol32(a, 12)
+ sve_bitop eor,SS2,SS1
+ sve_op add,TT2.s,TT2.s,VT2.s
+ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index]
+ FUNC_EOR3 VH,VA,VB,VC
+ eor VT1.d,\w\().d,\w4\().d
+ sve_op add,VH.s,VH.s,VD.s
+ sve_op add,VH.s,VH.s,VT1.s
+ add VD.s,TT2.s,SS1.s
+ sve_op add,VH.s,VH.s,SS2.s
+ // d = P0(TT2)
+ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17
+ SVE_EOR3 VD,VT1,VT3
+ // b = rol32(b, 9)
+ // f = rol32(f, 19)
+ rotate_left VB,VB,VT3,9,VF,VF,VT4,19
+.endm
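
The step macros follow the standard SM3 round with the register roles noted in their
comments (FF and GG are plain xor for rounds 0-15, majority and chooser for rounds 16-63);
a scalar reference round, illustrative only and not part of this patch, with tj_rotated
standing for the precomputed rol32(Tj, j % 32) constant loaded via ld1rw:

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, unsigned int n)
{
	return n ? (x << n) | (x >> (32 - n)) : x;
}

/* v[0..7] = a,b,c,d,e,f,g,h; wj = W[j]; wbj = W'[j] = W[j] ^ W[j+4] */
static void sm3_round(uint32_t v[8], uint32_t wj, uint32_t wbj,
		      uint32_t tj_rotated, int j)
{
	uint32_t a = v[0], b = v[1], c = v[2], d = v[3];
	uint32_t e = v[4], f = v[5], g = v[6], h = v[7];
	uint32_t ss1 = rol32(rol32(a, 12) + e + tj_rotated, 7);
	uint32_t ss2 = ss1 ^ rol32(a, 12);
	uint32_t ff = j < 16 ? a ^ b ^ c : (a & b) | (a & c) | (b & c);
	uint32_t gg = j < 16 ? e ^ f ^ g : (e & f) | (~e & g);
	uint32_t tt1 = ff + d + ss2 + wbj;
	uint32_t tt2 = gg + h + ss1 + wj;

	v[3] = c;
	v[2] = rol32(b, 9);
	v[1] = a;
	v[0] = tt1;
	v[7] = g;
	v[6] = rol32(f, 19);
	v[5] = e;
	v[4] = tt2 ^ rol32(tt2, 9) ^ rol32(tt2, 17);	/* P0(TT2) */
}
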
+
+.macro SM3_STEP_12_15 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req
+ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7)
+ rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12
+ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4]
+ mov TT2.s,p0/m,SS1.s
+ sve_bitop eor,VT,\w16
+ sve_op add,SS1.s,SS1.s,VE.s
+ sve_bitop eor,VT,\w9
+ sve_op add,SS1.s,SS1.s,VT1.s
+ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23
+ SVE_EOR3 VT,VT1,VT3
+ rotate_left SS1,SS1,VT2,7
+ sve_bitop eor,\w4,VT
+ // SS2 = SS1 ^ rol32(a, 12)
+ eor SS2.d,TT2.d,SS1.d
+ sve_bitop eor,\w4,\w6
+ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index]
+ FUNC_EOR3 TT2,VE,VF,VG
+ add VT1.s,\w\().s,VH.s
+ sve_op add,TT2.s,TT2.s,VT1.s
+ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index]
+ FUNC_EOR3 VH,VA,VB,VC
+ eor VT1.d,\w\().d,\w4\().d
+ sve_op add,VH.s,VH.s,VD.s
+ // b = rol32(b, 9)
+ // f = rol32(f, 19)
+ rotate_left VB,VB,VT3,9
+ sve_op add,VH.s,VH.s,VT1.s
+ add VD.s,TT2.s,SS1.s
+ sve_op add,VH.s,VH.s,SS2.s
+ // d = P0(TT2)
+ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17,VF,VF,TT2,19
+ SVE_EOR3 VD,VT1,VT3
+.endm
+
+.macro SM3_STEP_16_62 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req
+ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7)
+ rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12
+ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4]
+ mov TT2.s,p0/m,SS1.s
+ sve_bitop eor,VT,\w16
+ sve_op add,SS1.s,SS1.s,VE.s
+ sve_bitop eor,VT,\w9
+ sve_op add,SS1.s,SS1.s,VT1.s
+ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23
+ SVE_EOR3 \w4,VT,VT1
+ rotate_left SS1,SS1,VT2,7
+ sve_bitop eor,\w4,VT3
+ // SS2 = SS1 ^ rol32(a, 12)
+ eor SS2.d,TT2.d,SS1.d
+ sve_bitop eor,\w4,\w6
+ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index]
+ sve_op add,SS1.s,SS1.s,\w\().s
+ FUNC_BSL TT2,VE,VF,VG,VT1
+ sve_op add,SS1.s,SS1.s,VH.s
+ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index]
+ FUNC_FF \windex,VH,VA,VB,VC,VT1,VT2
+ eor VT1.d,\w\().d,\w4\().d
+ sve_op add,VH.s,VH.s,VD.s
+ // b = rol32(b, 9)
+ // f = rol32(f, 19)
+ rotate_left VB,VB,VT2,9,VF,VF,VT4,19
+ sve_op add,VH.s,VH.s,VT1.s
+ add VD.s,TT2.s,SS1.s
+ sve_op add,VH.s,VH.s,SS2.s
+ // d = P0(TT2)
+ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17
+ SVE_EOR3 VD,VT1,VT3
+.endm
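
The eor/rotate chain feeding \w4 in SM3_STEP_12_15 and SM3_STEP_16_62 above (and again in
SM3_STEP_63 below) is the SM3 message expansion, folded into the round so only a small
window of W words has to stay live in registers (hence the "% 17" indexing in exec_step
further down). A scalar reference, illustrative only:

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, unsigned int n)
{
	return n ? (x << n) | (x >> (32 - n)) : x;
}

/* W[j] for j >= 16, per the SM3 spec; P1(x) = x ^ rol32(x,15) ^ rol32(x,23) */
static uint32_t sm3_expand(const uint32_t w[], int j)
{
	uint32_t x = w[j - 16] ^ w[j - 9] ^ rol32(w[j - 3], 15);
	uint32_t p1 = x ^ rol32(x, 15) ^ rol32(x, 23);

	return p1 ^ rol32(w[j - 13], 7) ^ w[j - 6];
}
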
+
+.macro SM3_STEP_63 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req
+ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7)
+ rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12
+ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4]
+ mov TT2.s,p0/m,SS1.s
+ sve_bitop eor,VT,\w16
+ sve_op add,SS1.s,SS1.s,VE.s
+ sve_bitop eor,VT,\w9
+ sve_op add,SS1.s,SS1.s,VT1.s
+ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23
+ SVE_EOR3 VT,VT1,VT3
+ rotate_left SS1,SS1,VT2,7
+ sve_bitop eor,\w4,VT
+ // SS2 = SS1 ^ rol32(a, 12)
+ eor SS2.d,TT2.d,SS1.d
+ sve_bitop eor,\w4,\w6
+ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index]
+ FUNC_BSL TT2,VE,VF,VG,VT1
+ add VT1.s,\w\().s,VH.s
+ .if \windex == 63
+ ld1w {WORD0.s},p0/z,[abcd_buf, 0, MUL VL]
+ ld1w {WORD1.s},p0/z,[abcd_buf, 1, MUL VL]
+ ld1w {WORD2.s},p0/z,[abcd_buf, 2, MUL VL]
+ ld1w {WORD3.s},p0/z,[abcd_buf, 3, MUL VL]
+ ld1w {WORD4.s},p0/z,[abcd_buf, 4, MUL VL]
+ ld1w {WORD5.s},p0/z,[abcd_buf, 5, MUL VL]
+ ld1w {WORD6.s},p0/z,[abcd_buf, 6, MUL VL]
+ ld1w {WORD7.s},p0/z,[abcd_buf, 7, MUL VL]
+ .endif
+ sve_op add,TT2.s,TT2.s,VT1.s
+ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index]
+ FUNC_FF \windex,VH,VA,VB,VC,VT1,VT2
+ eor VT1.d,\w\().d,\w4\().d
+ sve_op add,VH.s,VH.s,VD.s
+ // b = rol32(b, 9)
+ // f = rol32(f, 19)
+ rotate_left VB,VB,VT2,9,VF,VF,VT4,19
+ sve_op add,VH.s,VH.s,VT1.s
+ add VD.s,TT2.s,SS1.s
+ sve_bitop eor,VA,WORD1
+ sve_bitop eor,VB,WORD2
+ sve_bitop eor,VC,WORD3
+ // d = P0(TT2)
+ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17
+ sve_bitop eor,VF,WORD6
+ SVE_EOR3 VD,VT1,VT3
+ sve_bitop eor,VG,WORD7
+ sve_bitop eor,VD,WORD4
+ sve_op add,VH.s,VH.s,SS2.s
+ sve_bitop eor,VE,WORD5
+ sve_bitop eor,VH,WORD0
+.endm
+
+.macro SWAP_STATES
+ .unreq TT
+ TT .req VH
+ .unreq VH
+ VH .req VG
+ .unreq VG
+ VG .req VF
+ .unreq VF
+ VF .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT
+.endm
+
+.altmacro
+.macro SM3_STEP_WRAPPER windex:req,idx:req,idx4:req,idx16,idx13,idx9,idx6,idx3
+ .if \windex <= 11
+ revb WORD\idx4\().s, p0/m, WORD\idx4\().s
+ next=\idx4+1
+ load_next_words %next
+ SM3_STEP_00_11 \windex,WORD\idx\(),WORD\idx4\()
+ .else
+ .if \windex < 16
+ SM3_STEP_12_15 \windex,WORD\idx\(),\
+ WORD\idx4\(),WORD\idx16\(),WORD\idx13\(),\
+ WORD\idx9\(),WORD\idx6\(),WORD\idx3\()
+ .else
+ .if \windex == 63
+ SM3_STEP_63 \windex,WORD\idx\(),WORD\idx4\(),\
+ WORD\idx16\(),WORD\idx13\(),WORD\idx9\(),\
+ WORD\idx6\(),WORD\idx3\()
+ .else
+ SM3_STEP_16_62 \windex,WORD\idx\(),WORD\idx4\(),\
+ WORD\idx16\(),WORD\idx13\(),WORD\idx9\(),\
+ WORD\idx6\(),WORD\idx3\()
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 11
+ idx4=\windex+4
+ SM3_STEP_WRAPPER \windex,\windex,%idx4
+ .else
+ idxp4=\windex + 4
+ idx4=idxp4 % 17
+ idx16=(idxp4 - 16) % 17
+ idx13=(idxp4 - 13) % 17
+ idx9=(idxp4 - 9) % 17
+ idx6=(idxp4 - 6) % 17
+ idx3=(idxp4 - 3) % 17
+ idx=\windex % 17
+ SM3_STEP_WRAPPER \windex,%idx,%idx4,%idx16,%idx13,%idx9,%idx6,%idx3
+ .endif
+ SWAP_STATES
+.endm
+
+.macro sm3_exec
+ current_step=0
+ .rept 64
+ exec_step %current_step
+ current_step=current_step+1
+ .endr
+.endm
+
+.macro sm3_single sve2:vararg
+ .ifnb \sve2
+ have_sve2 = 1
+ .else
+ have_sve2=0
+ .endif
+ st1w {VA.s},p0,[abcd_buf, 0, MUL VL]
+ st1w {VB.s},p0,[abcd_buf, 1, MUL VL]
+ st1w {VC.s},p0,[abcd_buf, 2, MUL VL]
+ st1w {VD.s},p0,[abcd_buf, 3, MUL VL]
+ st1w {VE.s},p0,[abcd_buf, 4, MUL VL]
+ st1w {VF.s},p0,[abcd_buf, 5, MUL VL]
+ st1w {VG.s},p0,[abcd_buf, 6, MUL VL]
+ st1w {VH.s},p0,[abcd_buf, 7, MUL VL]
+ load_words 0
+ load_words 1
+ load_words 2
+ load_words 3
+ load_words 4
+ revb WORD0.s, p0/m, WORD0.s
+ revb WORD1.s, p0/m, WORD1.s
+ revb WORD2.s, p0/m, WORD2.s
+ revb WORD3.s, p0/m, WORD3.s
+ .if have_sve2 == 1
+ mov VZERO.s,p0/m,#0
+ .endif
+ sm3_exec
+.endm
+
+.macro sm3_sve_save_stack
+ stp d8,d9,[sp, -64]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sm3_sve_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64
+.endm
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+SM3_CONSTS:
+ .word 0x79CC4519
+ .word 0xF3988A32
+ .word 0xE7311465
+ .word 0xCE6228CB
+ .word 0x9CC45197
+ .word 0x3988A32F
+ .word 0x7311465E
+ .word 0xE6228CBC
+ .word 0xCC451979
+ .word 0x988A32F3
+ .word 0x311465E7
+ .word 0x6228CBCE
+ .word 0xC451979C
+ .word 0x88A32F39
+ .word 0x11465E73
+ .word 0x228CBCE6
+ .word 0x9D8A7A87
+ .word 0x3B14F50F
+ .word 0x7629EA1E
+ .word 0xEC53D43C
+ .word 0xD8A7A879
+ .word 0xB14F50F3
+ .word 0x629EA1E7
+ .word 0xC53D43CE
+ .word 0x8A7A879D
+ .word 0x14F50F3B
+ .word 0x29EA1E76
+ .word 0x53D43CEC
+ .word 0xA7A879D8
+ .word 0x4F50F3B1
+ .word 0x9EA1E762
+ .word 0x3D43CEC5
+ .word 0x7A879D8A
+ .word 0xF50F3B14
+ .word 0xEA1E7629
+ .word 0xD43CEC53
+ .word 0xA879D8A7
+ .word 0x50F3B14F
+ .word 0xA1E7629E
+ .word 0x43CEC53D
+ .word 0x879D8A7A
+ .word 0x0F3B14F5
+ .word 0x1E7629EA
+ .word 0x3CEC53D4
+ .word 0x79D8A7A8
+ .word 0xF3B14F50
+ .word 0xE7629EA1
+ .word 0xCEC53D43
+ .word 0x9D8A7A87
+ .word 0x3B14F50F
+ .word 0x7629EA1E
+ .word 0xEC53D43C
+ .word 0xD8A7A879
+ .word 0xB14F50F3
+ .word 0x629EA1E7
+ .word 0xC53D43CE
+ .word 0x8A7A879D
+ .word 0x14F50F3B
+ .word 0x29EA1E76
+ .word 0x53D43CEC
+ .word 0xA7A879D8
+ .word 0x4F50F3B1
+ .word 0x9EA1E762
+ .word 0x3D43CEC5
+
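The SM3_CONSTS table is the 64 per-round constants with the round-dependent rotation already
applied, so the step macros can broadcast them with a single ld1rw instead of rotating at run
time. A small stand-alone generator that reproduces the words above, illustrative only, using
the SM3 spec values Tj = 0x79CC4519 for rounds 0-15 and 0x7A879D8A for rounds 16-63:

#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t x, unsigned int n)
{
	return n ? (x << n) | (x >> (32 - n)) : x;
}

int main(void)
{
	/* entry j is rol32(Tj, j % 32) */
	for (unsigned int j = 0; j < 64; j++)
		printf("\t.word 0x%08X\n",
		       rol32(j < 16 ? 0x79CC4519u : 0x7A879D8Au, j % 32));
	return 0;
}
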
--
2.25.1