369 lines
11 KiB
Diff
369 lines
11 KiB
Diff
|
|
From 64c66455fef1c908cc8f06a2b71aa2fd71806218 Mon Sep 17 00:00:00 2001
|
||
|
|
From: Yixing Liu <liuyixing1@huawei.com>
|
||
|
|
Date: Wed, 15 Dec 2021 16:42:30 +0800
|
||
|
|
Subject: [PATCH 7/8] libhns: Add support for direct wqe
|
||
|
|
|
||
|
|
The current write wqe mechanism is to write to DDR first, and then notify
|
||
|
|
the hardware through a doorbell to read the data. Direct wqe is a mechanism
|
||
|
|
to fill the wqe directly into the hardware. Under light load, the wqe
|
||
|
|
will be filled into the PCIe BAR space of the hardware, which will save one
|
||
|
|
memory access operation and therefore reduce the latency. SIMD instructions
|
||
|
|
allow the CPU to write 512 bits to device memory at one time, so they can
|
||
|
|
be used for posting direct wqe.
|
||
|
|
|
||
|
|
The process of post send of HIP08/09:
|
||
|
|
|
||
|
|
+-----------+
|
||
|
|
| post send |
|
||
|
|
+-----+-----+
|
||
|
|
|
|
||
|
|
+-----+-----+
|
||
|
|
| write WQE |
|
||
|
|
+-----+-----+
|
||
|
|
|
|
||
|
|
| udma_to_device_barrier()
|
||
|
|
|
|
||
|
|
+-----+-----+ Y +-----------+ N
|
||
|
|
| HIP09 ? +------+ multi WR ?+-------------+
|
||
|
|
+-----+-----+ +-----+-----+ |
|
||
|
|
| N | Y |
|
||
|
|
+-----+-----+ +-----+-----+ +--------+--------+
|
||
|
|
| ring DB | | ring DB | |direct WQE (ST4) |
|
||
|
|
+-----------+ +-----------+ +-----------------+
|
||
|
|
|
||
|
|
Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
|
||
|
|
Signed-off-by: Lang Cheng <chenglang@huawei.com>
|
||
|
|
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
|
||
|
|
---
|
||
|
|
providers/hns/hns_roce_u.h | 5 +++-
|
||
|
|
providers/hns/hns_roce_u_hw_v2.c | 43 ++++++++++++++++++++++++++------
|
||
|
|
providers/hns/hns_roce_u_hw_v2.h | 31 +++++++++++++----------
|
||
|
|
providers/hns/hns_roce_u_verbs.c | 26 +++++++++++++++++--
|
||
|
|
util/mmio.h | 27 +++++++++++++++++++-
|
||
|
|
5 files changed, 107 insertions(+), 25 deletions(-)
|
||
|
|
|
||
|
|
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
|
||
|
|
index b3f48113..37711363 100644
|
||
|
|
--- a/providers/hns/hns_roce_u.h
|
||
|
|
+++ b/providers/hns/hns_roce_u.h
|
||
|
|
@@ -80,6 +80,8 @@
|
||
|
|
|
||
|
|
#define INVALID_SGE_LENGTH 0x80000000
|
||
|
|
|
||
|
|
+#define HNS_ROCE_DWQE_PAGE_SIZE 65536
|
||
|
|
+
|
||
|
|
#define HNS_ROCE_ADDRESS_MASK 0xFFFFFFFF
|
||
|
|
#define HNS_ROCE_ADDRESS_SHIFT 32
|
||
|
|
|
||
|
|
@@ -279,13 +281,14 @@ struct hns_roce_qp {
|
||
|
|
struct hns_roce_sge_ex ex_sge;
|
||
|
|
unsigned int next_sge;
|
||
|
|
int port_num;
|
||
|
|
- int sl;
|
||
|
|
+ uint8_t sl;
|
||
|
|
unsigned int qkey;
|
||
|
|
enum ibv_mtu path_mtu;
|
||
|
|
|
||
|
|
struct hns_roce_rinl_buf rq_rinl_buf;
|
||
|
|
unsigned long flags;
|
||
|
|
int refcnt; /* specially used for XRC */
|
||
|
|
+ void *dwqe_page;
|
||
|
|
};
|
||
|
|
|
||
|
|
struct hns_roce_av {
|
||
|
|
diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c
|
||
|
|
index 1169b64b..f102fd61 100644
|
||
|
|
--- a/providers/hns/hns_roce_u_hw_v2.c
|
||
|
|
+++ b/providers/hns/hns_roce_u_hw_v2.c
|
||
|
|
@@ -33,6 +33,7 @@
|
||
|
|
#define _GNU_SOURCE
|
||
|
|
#include <stdio.h>
|
||
|
|
#include <string.h>
|
||
|
|
+#include <sys/mman.h>
|
||
|
|
#include "hns_roce_u.h"
|
||
|
|
#include "hns_roce_u_db.h"
|
||
|
|
#include "hns_roce_u_hw_v2.h"
|
||
|
|
@@ -297,20 +298,40 @@ static void hns_roce_update_rq_db(struct hns_roce_context *ctx,
|
||
|
|
}
|
||
|
|
|
||
|
|
static void hns_roce_update_sq_db(struct hns_roce_context *ctx,
|
||
|
|
- unsigned int qpn, unsigned int sl,
|
||
|
|
- unsigned int sq_head)
|
||
|
|
+ struct hns_roce_qp *qp)
|
||
|
|
{
|
||
|
|
struct hns_roce_db sq_db = {};
|
||
|
|
|
||
|
|
- sq_db.byte_4 = htole32(qpn);
|
||
|
|
+ sq_db.byte_4 = htole32(qp->verbs_qp.qp.qp_num);
|
||
|
|
roce_set_field(sq_db.byte_4, DB_BYTE_4_CMD_M, DB_BYTE_4_CMD_S,
|
||
|
|
HNS_ROCE_V2_SQ_DB);
|
||
|
|
- sq_db.parameter = htole32(sq_head);
|
||
|
|
- roce_set_field(sq_db.parameter, DB_PARAM_SL_M, DB_PARAM_SL_S, sl);
|
||
|
|
|
||
|
|
+ sq_db.parameter = htole32(qp->sq.head);
|
||
|
|
+ roce_set_field(sq_db.parameter, DB_PARAM_SL_M, DB_PARAM_SL_S, qp->sl);
|
||
|
|
hns_roce_write64(ctx->uar + ROCEE_VF_DB_CFG0_OFFSET, (__le32 *)&sq_db);
|
||
|
|
}
|
||
|
|
|
||
|
|
+static void hns_roce_write512(uint64_t *dest, uint64_t *val)
|
||
|
|
+{
|
||
|
|
+ mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
|
||
|
|
+{
|
||
|
|
+ struct hns_roce_rc_sq_wqe *rc_sq_wqe = wqe;
|
||
|
|
+
|
||
|
|
+ /* All kinds of DirectWQE have the same header field layout */
|
||
|
|
+ roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_FLAG_S, 1);
|
||
|
|
+ roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_DB_SL_L_M,
|
||
|
|
+ RC_SQ_WQE_BYTE_4_DB_SL_L_S, qp->sl);
|
||
|
|
+ roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_DB_SL_H_M,
|
||
|
|
+ RC_SQ_WQE_BYTE_4_DB_SL_H_S, qp->sl >> HNS_ROCE_SL_SHIFT);
|
||
|
|
+ roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_WQE_INDEX_M,
|
||
|
|
+ RC_SQ_WQE_BYTE_4_WQE_INDEX_S, qp->sq.head);
|
||
|
|
+
|
||
|
|
+ hns_roce_write512(qp->dwqe_page, wqe);
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
static void hns_roce_v2_update_cq_cons_index(struct hns_roce_context *ctx,
|
||
|
|
struct hns_roce_cq *cq)
|
||
|
|
{
|
||
|
|
@@ -339,8 +360,7 @@ static struct hns_roce_qp *hns_roce_v2_find_qp(struct hns_roce_context *ctx,
|
||
|
|
return NULL;
|
||
|
|
}
|
||
|
|
|
||
|
|
-static void hns_roce_v2_clear_qp(struct hns_roce_context *ctx,
|
||
|
|
- struct hns_roce_qp *qp)
|
||
|
|
+void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, struct hns_roce_qp *qp)
|
||
|
|
{
|
||
|
|
uint32_t qpn = qp->verbs_qp.qp.qp_num;
|
||
|
|
uint32_t tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
|
||
|
|
@@ -1196,6 +1216,7 @@ int hns_roce_u_v2_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
|
||
|
|
break;
|
||
|
|
case IBV_QPT_UD:
|
||
|
|
ret = set_ud_wqe(wqe, qp, wr, nreq, &sge_info);
|
||
|
|
+ qp->sl = to_hr_ah(wr->wr.ud.ah)->av.sl;
|
||
|
|
break;
|
||
|
|
default:
|
||
|
|
ret = EINVAL;
|
||
|
|
@@ -1214,7 +1235,10 @@ out:
|
||
|
|
|
||
|
|
udma_to_device_barrier();
|
||
|
|
|
||
|
|
- hns_roce_update_sq_db(ctx, ibvqp->qp_num, qp->sl, qp->sq.head);
|
||
|
|
+ if (nreq == 1 && (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE))
|
||
|
|
+ hns_roce_write_dwqe(qp, wqe);
|
||
|
|
+ else
|
||
|
|
+ hns_roce_update_sq_db(ctx, qp);
|
||
|
|
|
||
|
|
if (qp->flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
|
||
|
|
*(qp->sdb) = qp->sq.head & 0xffff;
|
||
|
|
@@ -1506,6 +1530,9 @@ static int hns_roce_u_v2_destroy_qp(struct ibv_qp *ibqp)
|
||
|
|
if (ret)
|
||
|
|
return ret;
|
||
|
|
|
||
|
|
+ if (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE)
|
||
|
|
+ munmap(qp->dwqe_page, HNS_ROCE_DWQE_PAGE_SIZE);
|
||
|
|
+
|
||
|
|
hns_roce_v2_clear_qp(ctx, qp);
|
||
|
|
|
||
|
|
hns_roce_lock_cqs(ibqp);
|
||
|
|
diff --git a/providers/hns/hns_roce_u_hw_v2.h b/providers/hns/hns_roce_u_hw_v2.h
|
||
|
|
index c13d82e3..af72cd70 100644
|
||
|
|
--- a/providers/hns/hns_roce_u_hw_v2.h
|
||
|
|
+++ b/providers/hns/hns_roce_u_hw_v2.h
|
||
|
|
@@ -40,6 +40,8 @@
|
||
|
|
|
||
|
|
#define HNS_ROCE_CMDSN_MASK 0x3
|
||
|
|
|
||
|
|
+#define HNS_ROCE_SL_SHIFT 2
|
||
|
|
+
|
||
|
|
/* V2 REG DEFINITION */
|
||
|
|
#define ROCEE_VF_DB_CFG0_OFFSET 0x0230
|
||
|
|
|
||
|
|
@@ -133,6 +135,8 @@ struct hns_roce_db {
|
||
|
|
#define DB_BYTE_4_CMD_S 24
|
||
|
|
#define DB_BYTE_4_CMD_M GENMASK(27, 24)
|
||
|
|
|
||
|
|
+#define DB_BYTE_4_FLAG_S 31
|
||
|
|
+
|
||
|
|
#define DB_PARAM_SRQ_PRODUCER_COUNTER_S 0
|
||
|
|
#define DB_PARAM_SRQ_PRODUCER_COUNTER_M GENMASK(15, 0)
|
||
|
|
|
||
|
|
@@ -216,8 +220,16 @@ struct hns_roce_rc_sq_wqe {
|
||
|
|
};
|
||
|
|
|
||
|
|
#define RC_SQ_WQE_BYTE_4_OPCODE_S 0
|
||
|
|
-#define RC_SQ_WQE_BYTE_4_OPCODE_M \
|
||
|
|
- (((1UL << 5) - 1) << RC_SQ_WQE_BYTE_4_OPCODE_S)
|
||
|
|
+#define RC_SQ_WQE_BYTE_4_OPCODE_M GENMASK(4, 0)
|
||
|
|
+
|
||
|
|
+#define RC_SQ_WQE_BYTE_4_DB_SL_L_S 5
|
||
|
|
+#define RC_SQ_WQE_BYTE_4_DB_SL_L_M GENMASK(6, 5)
|
||
|
|
+
|
||
|
|
+#define RC_SQ_WQE_BYTE_4_DB_SL_H_S 13
|
||
|
|
+#define RC_SQ_WQE_BYTE_4_DB_SL_H_M GENMASK(14, 13)
|
||
|
|
+
|
||
|
|
+#define RC_SQ_WQE_BYTE_4_WQE_INDEX_S 15
|
||
|
|
+#define RC_SQ_WQE_BYTE_4_WQE_INDEX_M GENMASK(30, 15)
|
||
|
|
|
||
|
|
#define RC_SQ_WQE_BYTE_4_OWNER_S 7
|
||
|
|
|
||
|
|
@@ -239,6 +251,8 @@ struct hns_roce_rc_sq_wqe {
|
||
|
|
|
||
|
|
#define RC_SQ_WQE_BYTE_4_RDMA_WRITE_S 22
|
||
|
|
|
||
|
|
+#define RC_SQ_WQE_BYTE_4_FLAG_S 31
|
||
|
|
+
|
||
|
|
#define RC_SQ_WQE_BYTE_16_XRC_SRQN_S 0
|
||
|
|
#define RC_SQ_WQE_BYTE_16_XRC_SRQN_M \
|
||
|
|
(((1UL << 24) - 1) << RC_SQ_WQE_BYTE_16_XRC_SRQN_S)
|
||
|
|
@@ -311,23 +325,12 @@ struct hns_roce_ud_sq_wqe {
|
||
|
|
#define UD_SQ_WQE_OPCODE_S 0
|
||
|
|
#define UD_SQ_WQE_OPCODE_M GENMASK(4, 0)
|
||
|
|
|
||
|
|
-#define UD_SQ_WQE_DB_SL_L_S 5
|
||
|
|
-#define UD_SQ_WQE_DB_SL_L_M GENMASK(6, 5)
|
||
|
|
-
|
||
|
|
-#define UD_SQ_WQE_DB_SL_H_S 13
|
||
|
|
-#define UD_SQ_WQE_DB_SL_H_M GENMASK(14, 13)
|
||
|
|
-
|
||
|
|
-#define UD_SQ_WQE_INDEX_S 15
|
||
|
|
-#define UD_SQ_WQE_INDEX_M GENMASK(30, 15)
|
||
|
|
-
|
||
|
|
#define UD_SQ_WQE_OWNER_S 7
|
||
|
|
|
||
|
|
#define UD_SQ_WQE_CQE_S 8
|
||
|
|
|
||
|
|
#define UD_SQ_WQE_SE_S 11
|
||
|
|
|
||
|
|
-#define UD_SQ_WQE_FLAG_S 31
|
||
|
|
-
|
||
|
|
#define UD_SQ_WQE_PD_S 0
|
||
|
|
#define UD_SQ_WQE_PD_M GENMASK(23, 0)
|
||
|
|
|
||
|
|
@@ -376,4 +379,6 @@ struct hns_roce_ud_sq_wqe {
|
||
|
|
|
||
|
|
#define MAX_SERVICE_LEVEL 0x7
|
||
|
|
|
||
|
|
+void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, struct hns_roce_qp *qp);
|
||
|
|
+
|
||
|
|
#endif /* _HNS_ROCE_U_HW_V2_H */
|
||
|
|
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
|
||
|
|
index 125858d2..fc902815 100644
|
||
|
|
--- a/providers/hns/hns_roce_u_verbs.c
|
||
|
|
+++ b/providers/hns/hns_roce_u_verbs.c
|
||
|
|
@@ -1076,7 +1076,8 @@ static int hns_roce_store_qp(struct hns_roce_context *ctx,
|
||
|
|
|
||
|
|
static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
|
||
|
|
struct hns_roce_qp *qp,
|
||
|
|
- struct hns_roce_context *ctx)
|
||
|
|
+ struct hns_roce_context *ctx,
|
||
|
|
+ uint64_t *dwqe_mmap_key)
|
||
|
|
{
|
||
|
|
struct hns_roce_create_qp_ex_resp resp_ex = {};
|
||
|
|
struct hns_roce_create_qp_ex cmd_ex = {};
|
||
|
|
@@ -1093,6 +1094,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
|
||
|
|
&resp_ex.ibv_resp, sizeof(resp_ex));
|
||
|
|
|
||
|
|
qp->flags = resp_ex.drv_payload.cap_flags;
|
||
|
|
+ *dwqe_mmap_key = resp_ex.drv_payload.dwqe_mmap_key;
|
||
|
|
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
@@ -1144,11 +1146,23 @@ static int hns_roce_alloc_qp_buf(struct ibv_qp_init_attr_ex *attr,
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
+static int mmap_dwqe(struct ibv_context *ibv_ctx, struct hns_roce_qp *qp,
|
||
|
|
+ uint64_t dwqe_mmap_key)
|
||
|
|
+{
|
||
|
|
+ qp->dwqe_page = mmap(NULL, HNS_ROCE_DWQE_PAGE_SIZE, PROT_WRITE,
|
||
|
|
+ MAP_SHARED, ibv_ctx->cmd_fd, dwqe_mmap_key);
|
||
|
|
+ if (qp->dwqe_page == MAP_FAILED)
|
||
|
|
+ return -EINVAL;
|
||
|
|
+
|
||
|
|
+ return 0;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
|
||
|
|
struct ibv_qp_init_attr_ex *attr)
|
||
|
|
{
|
||
|
|
struct hns_roce_context *context = to_hr_ctx(ibv_ctx);
|
||
|
|
struct hns_roce_qp *qp;
|
||
|
|
+ uint64_t dwqe_mmap_key;
|
||
|
|
int ret;
|
||
|
|
|
||
|
|
ret = verify_qp_create_attr(context, attr);
|
||
|
|
@@ -1167,7 +1181,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
|
||
|
|
if (ret)
|
||
|
|
goto err_buf;
|
||
|
|
|
||
|
|
- ret = qp_exec_create_cmd(attr, qp, context);
|
||
|
|
+ ret = qp_exec_create_cmd(attr, qp, context, &dwqe_mmap_key);
|
||
|
|
if (ret)
|
||
|
|
goto err_cmd;
|
||
|
|
|
||
|
|
@@ -1175,10 +1189,18 @@ static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
|
||
|
|
if (ret)
|
||
|
|
goto err_store;
|
||
|
|
|
||
|
|
+ if (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE) {
|
||
|
|
+ ret = mmap_dwqe(ibv_ctx, qp, dwqe_mmap_key);
|
||
|
|
+ if (ret)
|
||
|
|
+ goto err_dwqe;
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
qp_setup_config(attr, qp, context);
|
||
|
|
|
||
|
|
return &qp->verbs_qp.qp;
|
||
|
|
|
||
|
|
+err_dwqe:
|
||
|
|
+ hns_roce_v2_clear_qp(context, qp);
|
||
|
|
err_store:
|
||
|
|
ibv_cmd_destroy_qp(&qp->verbs_qp.qp);
|
||
|
|
err_cmd:
|
||
|
|
diff --git a/util/mmio.h b/util/mmio.h
|
||
|
|
index 101af9dd..01d1455e 100644
|
||
|
|
--- a/util/mmio.h
|
||
|
|
+++ b/util/mmio.h
|
||
|
|
@@ -210,8 +210,33 @@ static inline void mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
|
||
|
|
{
|
||
|
|
s390_mmio_write(dest, src, bytecnt);
|
||
|
|
}
|
||
|
|
-#else
|
||
|
|
|
||
|
|
+#elif defined(__aarch64__) || defined(__arm__)
|
||
|
|
+#include <arm_neon.h>
|
||
|
|
+
|
||
|
|
+static inline void _mmio_memcpy_x64_64b(void *dest, const void *src)
|
||
|
|
+{
|
||
|
|
+ vst4q_u64(dest, vld4q_u64(src));
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+static inline void _mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
|
||
|
|
+{
|
||
|
|
+ do {
|
||
|
|
+ _mmio_memcpy_x64_64b(dest, src);
|
||
|
|
+ bytecnt -= sizeof(uint64x2x4_t);
|
||
|
|
+ src += sizeof(uint64x2x4_t);
|
||
|
|
+ } while (bytecnt > 0);
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+#define mmio_memcpy_x64(dest, src, bytecount) \
|
||
|
|
+ ({ \
|
||
|
|
+ if (__builtin_constant_p((bytecount) == 64)) \
|
||
|
|
+ _mmio_memcpy_x64_64b((dest), (src)); \
|
||
|
|
+ else \
|
||
|
|
+ _mmio_memcpy_x64((dest), (src), (bytecount)); \
|
||
|
|
+ })
|
||
|
|
+
|
||
|
|
+#else
|
||
|
|
/* Transfer is some multiple of 64 bytes */
|
||
|
|
static inline void mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
|
||
|
|
{
|
||
|
|
--
|
||
|
|
2.33.0
|
||
|
|
|