From b05879f0287aa5b4bd315fea3ef0e0b82238e935 Mon Sep 17 00:00:00 2001
From: Chengchang Tang <tangchengchang@huawei.com>
Date: Mon, 23 Oct 2023 21:13:03 +0800
Subject: [PATCH 11/18] libhns: Support flexible WQE buffer page size

driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I98HIN

--------------------------------------------------------------------------

Currently, the driver always allocates 4K pages for the user-space WQE
buffer, even on a 64K-page system. As a result, HW reads WQEs at a 4K
granularity even on a 64K system. Since up to 1024 bytes of inline data
are supported, a 4K page holds only 4096 / 1024 = 4 such WQEs, so with
SQ inline the HW has to switch pages every 4 WQEs. Each page switch
introduces a delay of about 400ns, an average of 100ns per packet.

To improve performance, allow user-mode drivers to use a larger page
size for WQE buffers, reducing the latency introduced by HW page
switching. User-mode drivers may allocate WQE buffers with any page
size between 4K and the system page size. During ibv_create_qp(), the
driver dynamically selects the appropriate page size based on
ibv_qp_cap, reducing memory consumption while improving performance.
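
As a rough illustration of the selection (a standalone sketch, not part
of this patch: the region sizes and the ilog32_ceil() helper are made
up for the example), the chosen shift is the power-of-2 ceiling of the
largest WQE region, clamped between the 4K HW page and the system page:

#include <stdint.h>
#include <stdio.h>

/* ilog2 of the next power of two >= v, e.g. 6000 -> 13. */
static uint8_t ilog32_ceil(uint32_t v)
{
        uint8_t shift = 0;

        while ((1U << shift) < v)
                shift++;
        return shift;
}

int main(void)
{
        uint32_t sq_size = 128 * 64;      /* SQ WQE region:  8 KiB */
        uint32_t ext_sge_size = 256 * 16; /* ext SGE region: 4 KiB */
        uint32_t rq_size = 512 * 32;      /* RQ WQE region: 16 KiB */
        uint8_t sys_page_shift = 16;      /* 64 KiB system pages   */
        uint8_t hw_page_shift = 12;       /* 4 KiB HW minimum      */
        uint8_t pg_shift;

        /* Largest region, rounded up to a power of two... */
        pg_shift = ilog32_ceil(sq_size);
        if (ilog32_ceil(ext_sge_size) > pg_shift)
                pg_shift = ilog32_ceil(ext_sge_size);
        if (ilog32_ceil(rq_size) > pg_shift)
                pg_shift = ilog32_ceil(rq_size);

        /* ...then clamped to [4K, system page size]. */
        if (pg_shift < hw_page_shift)
                pg_shift = hw_page_shift;
        if (pg_shift > sys_page_shift)
                pg_shift = sys_page_shift;

        printf("selected page shift: %d\n", pg_shift); /* prints 14 */
        return 0;
}

Here the RQ region (16 KiB) dominates, so the QP buffer is laid out on
16 KiB pages rather than four 4 KiB pages.
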
This feature must be used in conjunction with the kernel-mode driver.
To ensure forward compatibility, if the kernel-mode driver does not
support this feature, the user-mode driver falls back to the fixed 4K
page size for WQE buffers.

Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
---
 kernel-headers/rdma/hns-abi.h    |  5 +++-
 providers/hns/hns_roce_u.h       |  1 +
 providers/hns/hns_roce_u_verbs.c | 51 ++++++++++++++++++++++++++++----
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h
index 39ed8a4..f33d876 100644
--- a/kernel-headers/rdma/hns-abi.h
+++ b/kernel-headers/rdma/hns-abi.h
@@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
         __u8 log_sq_bb_count;
         __u8 log_sq_stride;
         __u8 sq_no_prefetch;
-        __u8 reserved[5];
+        __u8 pageshift;
+        __u8 reserved[4];
         __aligned_u64 sdb_addr;
         __aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
         __aligned_u64 create_flags;
@@ -119,12 +120,14 @@ enum {
         HNS_ROCE_EXSGE_FLAGS = 1 << 0,
         HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
         HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
+        HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
 };
 
 enum {
         HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0,
         HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
         HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
+        HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
 };
 
 struct hns_roce_ib_alloc_ucontext_resp {
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index 21a6e28..56851b0 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -349,6 +349,7 @@ struct hns_roce_qp {
         uint8_t sl;
         uint8_t tc_mode;
         uint8_t priority;
+        uint8_t pageshift;
         unsigned int qkey;
         enum ibv_mtu path_mtu;
 
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
index 00e59dc..fc255ed 100644
--- a/providers/hns/hns_roce_u_verbs.c
+++ b/providers/hns/hns_roce_u_verbs.c
@@ -1178,31 +1178,69 @@ static void free_recv_rinl_buf(struct hns_roce_rinl_buf *rinl_buf)
         }
 }
 
+static void get_best_multi_region_pg_shift(struct hns_roce_device *hr_dev,
+                                           struct hns_roce_context *ctx,
+                                           struct hns_roce_qp *qp)
+{
+        uint32_t ext_sge_size;
+        uint32_t sq_size;
+        uint32_t rq_size;
+        uint8_t pg_shift;
+
+        if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ)) {
+                qp->pageshift = HNS_HW_PAGE_SHIFT;
+                return;
+        }
+
+        /*
+         * The larger the pagesize used, the better the performance, but it
+         * may waste more memory. Therefore, we use the least common multiple
+         * (aligned to power of 2) of sq wqe buffer size, rq wqe buffer size,
+         * and ext_sge buffer size as the pagesize. Additionally, since the
+         * kernel cannot guarantee the allocation of contiguous memory larger
+         * than the system page, the pagesize must be smaller than the system
+         * page.
+         */
+        sq_size = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+        ext_sge_size = qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift;
+        rq_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+
+        pg_shift = max_t(uint8_t, sq_size ? hr_ilog32(sq_size) : 0,
+                         ext_sge_size ? hr_ilog32(ext_sge_size) : 0);
+        pg_shift = max_t(uint8_t, pg_shift, rq_size ? hr_ilog32(rq_size) : 0);
+        pg_shift = max_t(uint8_t, pg_shift, HNS_HW_PAGE_SHIFT);
+        qp->pageshift = min_t(uint8_t, pg_shift, hr_ilog32(hr_dev->page_size));
+}
+
 static int calc_qp_buff_size(struct hns_roce_device *hr_dev,
+                             struct hns_roce_context *ctx,
                              struct hns_roce_qp *qp)
 {
         struct hns_roce_wq *sq = &qp->sq;
         struct hns_roce_wq *rq = &qp->rq;
+        unsigned int page_size;
         unsigned int size;
 
         qp->buf_size = 0;
+        get_best_multi_region_pg_shift(hr_dev, ctx, qp);
+        page_size = 1 << qp->pageshift;
 
         /* SQ WQE */
         sq->offset = 0;
-        size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift);
+        size = align(sq->wqe_cnt << sq->wqe_shift, page_size);
         qp->buf_size += size;
 
         /* extend SGE WQE in SQ */
         qp->ex_sge.offset = qp->buf_size;
         if (qp->ex_sge.sge_cnt > 0) {
-                size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt,
-                                              qp->ex_sge.sge_shift);
+                size = align(qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift,
+                             page_size);
                 qp->buf_size += size;
         }
 
         /* RQ WQE */
         rq->offset = qp->buf_size;
-        size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift);
+        size = align(rq->wqe_cnt << rq->wqe_shift, page_size);
         qp->buf_size += size;
 
         if (qp->buf_size < 1)
@@ -1227,7 +1265,7 @@ static int qp_alloc_wqe(struct ibv_qp_cap *cap, struct hns_roce_qp *qp,
 {
         struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device);
 
-        if (calc_qp_buff_size(hr_dev, qp))
+        if (calc_qp_buff_size(hr_dev, ctx, qp))
                 return -EINVAL;
 
         qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
@@ -1245,7 +1283,7 @@ static int qp_alloc_wqe(struct ibv_qp_cap *cap, struct hns_roce_qp *qp,
                 goto err_alloc;
         }
 
-        if (hns_roce_alloc_buf(&qp->buf, qp->buf_size, HNS_HW_PAGE_SIZE))
+        if (hns_roce_alloc_buf(&qp->buf, qp->buf_size, 1 << qp->pageshift))
                 goto err_alloc;
 
         return 0;
@@ -1482,6 +1520,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
         cmd_ex.buf_addr = (uintptr_t)qp->buf.buf;
         cmd_ex.log_sq_stride = qp->sq.wqe_shift;
         cmd_ex.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
+        cmd_ex.pageshift = qp->pageshift;
 
         if (hns_attr &&
             hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CONGEST_TYPE) {
--
2.33.0
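
For reference, no userspace API change is involved: the page-size
negotiation happens inside the provider, so an application keeps
calling ibv_create_qp() as usual. A minimal sketch (error handling
elided; the capability values are arbitrary and pd/cq are assumed to
have been created beforehand):

#include <infiniband/verbs.h>

/* The provider derives the WQE buffer page shift from these caps,
 * e.g. the 1024-byte inline case described in the commit message. */
static struct ibv_qp *create_example_qp(struct ibv_pd *pd,
                                        struct ibv_cq *cq)
{
        struct ibv_qp_init_attr attr = {
                .send_cq = cq,
                .recv_cq = cq,
                .qp_type = IBV_QPT_RC,
                .cap = {
                        .max_send_wr = 128,
                        .max_recv_wr = 512,
                        .max_send_sge = 4,
                        .max_recv_sge = 4,
                        .max_inline_data = 1024,
                },
        };

        return ibv_create_qp(pd, &attr);
}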