When the HW is in the resetting stage, we cannot poll back all the
expected work completions because the HW will not generate CQEs anymore.
This patch allows the driver to compose the expected WCs instead of the
HW during the resetting stage. Once the hardware has finished resetting,
we can poll the CQ from hardware again.

Signed-off-by: Ran Zhou <zhouran10@h-partners.com>
(cherry picked from commit 5494e44cf97e65d858c8f7376c0424a833dc8323)
From b05879f0287aa5b4bd315fea3ef0e0b82238e935 Mon Sep 17 00:00:00 2001
From: Chengchang Tang <tangchengchang@huawei.com>
Date: Mon, 23 Oct 2023 21:13:03 +0800
Subject: [PATCH 11/18] libhns: Support flexible WQE buffer page size

driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I98HIN

--------------------------------------------------------------------------

Currently, the driver always allocates 4K pages for the user-space WQE
buffer, even on a 64K-page system. As a result, HW reads WQEs with a
granularity of 4K. Since we support 1024-byte inline WQEs, HW has to
switch pages every 4 WQEs in the SQ inline scenario. Each page switch
introduces a delay of about 400ns, i.e. an average of 100ns per packet.

To improve performance, allow the user-mode driver to use a larger page
size for WQE buffers and thereby reduce the latency introduced by HW
page switching. The user-mode driver may allocate WQE buffers with a
page size between 4K and the system page size. During ibv_create_qp(),
the driver dynamically selects an appropriate page size based on
ibv_qp_cap, improving performance while limiting memory consumption.
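
As a rough illustration of the selection logic added below in
get_best_multi_region_pg_shift() (a sketch only: ceil_log2() and
pick_wqe_page_shift() are made-up names, and the constant 12 stands in
for HNS_HW_PAGE_SHIFT):

#include <stdint.h>

/* Illustrative helper: smallest shift s such that (1 << s) >= n. */
static uint8_t ceil_log2(uint32_t n)
{
	uint8_t s = 0;

	while ((1U << s) < n)
		s++;
	return s;
}

/*
 * Pick one page shift that covers each of the SQ, extended-SGE and RQ
 * regions, clamped between the 4K HW page and the system page size.
 */
static uint8_t pick_wqe_page_shift(uint32_t sq_bytes, uint32_t sge_bytes,
				   uint32_t rq_bytes, uint32_t sys_pgsz)
{
	uint8_t shift = 12;	/* 4K lower bound (HW page shift) */

	if (sq_bytes && ceil_log2(sq_bytes) > shift)
		shift = ceil_log2(sq_bytes);
	if (sge_bytes && ceil_log2(sge_bytes) > shift)
		shift = ceil_log2(sge_bytes);
	if (rq_bytes && ceil_log2(rq_bytes) > shift)
		shift = ceil_log2(rq_bytes);

	if (shift > ceil_log2(sys_pgsz))
		shift = ceil_log2(sys_pgsz);

	return shift;
}

For example, a QP with a 16K SQ region, an 8K extended-SGE region and a
16K RQ region gets 16K pages on a 64K-page system, while the same QP on
a 4K-page system stays at 4K.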

This feature needs to be used in conjunction with the kernel-mode
driver. To ensure forward compatibility, if the kernel-mode driver does
not support this feature, the user-mode driver keeps using a fixed 4K
page size for the WQE buffer.
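
The fallback can be sketched as follows; it mirrors the check at the top
of get_best_multi_region_pg_shift() in the hunk below, and the wrapper
function name is illustrative only (the struct and macro names come from
the provider headers):

/*
 * If the kernel did not acknowledge HNS_ROCE_UCTX_DYN_QP_PGSZ in the
 * ucontext response, keep the old fixed 4K WQE buffer page size.
 */
static void set_wqe_page_shift(struct hns_roce_context *ctx,
			       struct hns_roce_qp *qp)
{
	if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ)) {
		qp->pageshift = HNS_HW_PAGE_SHIFT;	/* 4K pages */
		return;
	}

	/* Otherwise a larger page shift is chosen as described above. */
}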

Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
---
 kernel-headers/rdma/hns-abi.h    |  5 +++-
 providers/hns/hns_roce_u.h       |  1 +
 providers/hns/hns_roce_u_verbs.c | 51 ++++++++++++++++++++++++++++----
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h
index 39ed8a4..f33d876 100644
--- a/kernel-headers/rdma/hns-abi.h
+++ b/kernel-headers/rdma/hns-abi.h
@@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
 	__u8 log_sq_bb_count;
 	__u8 log_sq_stride;
 	__u8 sq_no_prefetch;
-	__u8 reserved[5];
+	__u8 pageshift;
+	__u8 reserved[4];
 	__aligned_u64 sdb_addr;
 	__aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
 	__aligned_u64 create_flags;
@@ -119,12 +120,14 @@ enum {
 	HNS_ROCE_EXSGE_FLAGS = 1 << 0,
 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
+	HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
 };
 
 enum {
 	HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0,
 	HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
+	HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
 };
 
 struct hns_roce_ib_alloc_ucontext_resp {
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index 21a6e28..56851b0 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -349,6 +349,7 @@ struct hns_roce_qp {
 	uint8_t sl;
 	uint8_t tc_mode;
 	uint8_t priority;
+	uint8_t pageshift;
 	unsigned int qkey;
 	enum ibv_mtu path_mtu;
 
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
index 00e59dc..fc255ed 100644
--- a/providers/hns/hns_roce_u_verbs.c
+++ b/providers/hns/hns_roce_u_verbs.c
@@ -1178,31 +1178,69 @@ static void free_recv_rinl_buf(struct hns_roce_rinl_buf *rinl_buf)
 	}
 }
 
+static void get_best_multi_region_pg_shift(struct hns_roce_device *hr_dev,
+					   struct hns_roce_context *ctx,
+					   struct hns_roce_qp *qp)
+{
+	uint32_t ext_sge_size;
+	uint32_t sq_size;
+	uint32_t rq_size;
+	uint8_t pg_shift;
+
+	if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ)) {
+		qp->pageshift = HNS_HW_PAGE_SHIFT;
+		return;
+	}
+
+	/*
+	 * The larger the pagesize used, the better the performance, but it
+	 * may waste more memory. Therefore, we use the least common multiple
+	 * (aligned to power of 2) of sq wqe buffer size, rq wqe buffer size,
+	 * and ext_sge buffer size as the pagesize. Additionally, since the
+	 * kernel cannot guarantee the allocation of contiguous memory larger
+	 * than the system page, the pagesize must be smaller than the system
+	 * page.
+	 */
+	sq_size = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+	ext_sge_size = qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift;
+	rq_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+
+	pg_shift = max_t(uint8_t, sq_size ? hr_ilog32(sq_size) : 0,
+			 ext_sge_size ? hr_ilog32(ext_sge_size) : 0);
+	pg_shift = max_t(uint8_t, pg_shift, rq_size ? hr_ilog32(rq_size) : 0);
+	pg_shift = max_t(uint8_t, pg_shift, HNS_HW_PAGE_SHIFT);
+	qp->pageshift = min_t(uint8_t, pg_shift, hr_ilog32(hr_dev->page_size));
+}
+
 static int calc_qp_buff_size(struct hns_roce_device *hr_dev,
+			     struct hns_roce_context *ctx,
 			     struct hns_roce_qp *qp)
 {
 	struct hns_roce_wq *sq = &qp->sq;
 	struct hns_roce_wq *rq = &qp->rq;
+	unsigned int page_size;
 	unsigned int size;
 
 	qp->buf_size = 0;
+	get_best_multi_region_pg_shift(hr_dev, ctx, qp);
+	page_size = 1 << qp->pageshift;
 
 	/* SQ WQE */
 	sq->offset = 0;
-	size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift);
+	size = align(sq->wqe_cnt << sq->wqe_shift, page_size);
 	qp->buf_size += size;
 
 	/* extend SGE WQE in SQ */
 	qp->ex_sge.offset = qp->buf_size;
 	if (qp->ex_sge.sge_cnt > 0) {
-		size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt,
-					      qp->ex_sge.sge_shift);
+		size = align(qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift,
+			     page_size);
 		qp->buf_size += size;
 	}
 
 	/* RQ WQE */
 	rq->offset = qp->buf_size;
-	size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift);
+	size = align(rq->wqe_cnt << rq->wqe_shift, page_size);
 	qp->buf_size += size;
 
 	if (qp->buf_size < 1)
@@ -1227,7 +1265,7 @@ static int qp_alloc_wqe(struct ibv_qp_cap *cap, struct hns_roce_qp *qp,
 {
 	struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device);
 
-	if (calc_qp_buff_size(hr_dev, qp))
+	if (calc_qp_buff_size(hr_dev, ctx, qp))
 		return -EINVAL;
 
 	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
@@ -1245,7 +1283,7 @@ static int qp_alloc_wqe(struct ibv_qp_cap *cap, struct hns_roce_qp *qp,
 		goto err_alloc;
 	}
 
-	if (hns_roce_alloc_buf(&qp->buf, qp->buf_size, HNS_HW_PAGE_SIZE))
+	if (hns_roce_alloc_buf(&qp->buf, qp->buf_size, 1 << qp->pageshift))
 		goto err_alloc;
 
 	return 0;
@@ -1482,6 +1520,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
 	cmd_ex.buf_addr = (uintptr_t)qp->buf.buf;
 	cmd_ex.log_sq_stride = qp->sq.wqe_shift;
 	cmd_ex.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
+	cmd_ex.pageshift = qp->pageshift;
 
 	if (hns_attr &&
 	    hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CONGEST_TYPE) {
--
2.33.0
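
For completeness, a minimal verbs-level usage sketch: create_rc_qp() is
a made-up helper, and the pd and cq arguments are assumed to have been
created in the usual way. The application requests its queue capacities
as before; the hns provider derives the WQE buffer page size from these
caps internally, so no application change is required, and on kernels
without HNS_ROCE_UCTX_DYN_QP_PGSZ support the same call still succeeds
with the fixed 4K page size.

#include <stdio.h>
#include <infiniband/verbs.h>

static struct ibv_qp *create_rc_qp(struct ibv_pd *pd, struct ibv_cq *cq)
{
	struct ibv_qp_init_attr init_attr = {
		.send_cq = cq,
		.recv_cq = cq,
		.qp_type = IBV_QPT_RC,
		.cap = {
			.max_send_wr = 512,
			.max_recv_wr = 512,
			.max_send_sge = 2,
			.max_recv_sge = 2,
			.max_inline_data = 1024,  /* 1024-byte inline WQEs */
		},
	};
	struct ibv_qp *qp = ibv_create_qp(pd, &init_attr);

	if (!qp)
		fprintf(stderr, "ibv_create_qp failed\n");
	return qp;
}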