Support flexible WQE buffer page size
In order to improve performance, we allow user-mode drivers to use a larger page size to allocate WQE buffers, thereby reducing the latency introduced by HW page switching. User-mode drivers will be allowed to allocate WQE buffers between 4K to system page size. During ibv_create_qp(), the driver will dynamically select the appropriate page size based on ibv_qp_cap, thus reducing memory consumption while improving performance.

Signed-off-by: Ran Zhou <zhouran10@h-partners.com>
(cherry picked from commit 1a21f45d978a8c469d128838bfd6ef5a72d335e8)
This commit is contained in:
parent
22770c260d
commit
98e759e379
231
0058-libhns-Support-flexible-WQE-buffer-page-size.patch
Normal file
231
0058-libhns-Support-flexible-WQE-buffer-page-size.patch
Normal file
@@ -0,0 +1,231 @@
From d628c51d25b972a7d26e53ea400b3a0679d51f91 Mon Sep 17 00:00:00 2001
From: Chengchang Tang <tangchengchang@huawei.com>
Date: Mon, 23 Oct 2023 21:13:03 +0800
Subject: [PATCH] libhns: Support flexible WQE buffer page size

driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I87LTM

--------------------------------------------------------------------------

Currently, driver fixedly allocates 4K pages for user space WQE buffer
even in a 64K system. This results in HW reading WQE with a granularity
of 4K even in a 64K system. Considering that we support 1024-byte inline,
in the scenario of using SQ inline, HW will switch pages every 4 WQEs.
This will introduce a delay of about 400ns, which is an average delay of
100ns per packet.

In order to improve performance, we allow user-mode drivers to use a
larger page size to allocate WQE buffers, thereby reducing the latency
introduced by HW page switching. User-mode drivers will be allowed to
allocate WQE buffers between 4K to system page size. During
ibv_create_qp(), the driver will dynamically select the appropriate page
size based on ibv_qp_cap, thus reducing memory consumption while improving
performance.

This feature needs to be used in conjunction with the kernel-mode driver.
In order to ensure forward compatibility, if the kernel-mode driver does
not support this feature, the user-mode driver will continue to use a
fixed 4K pagesize to allocate WQE buffer.

Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
---
 kernel-headers/rdma/hns-abi.h | 5 ++-
 providers/hns/hns_roce_u.c | 2 +-
 providers/hns/hns_roce_u.h | 1 +
 providers/hns/hns_roce_u_verbs.c | 65 ++++++++++++++++++++++++++------
 4 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h
index cab941f..157dc9d 100644
--- a/kernel-headers/rdma/hns-abi.h
+++ b/kernel-headers/rdma/hns-abi.h
@@ -81,7 +81,8 @@ struct hns_roce_ib_create_qp {
 	__u8 log_sq_bb_count;
 	__u8 log_sq_stride;
 	__u8 sq_no_prefetch;
-	__u8 reserved[5];
+	__u8 reserved[4];
+	__u8 pageshift;
 	__aligned_u64 sdb_addr;
 	__aligned_u64 comp_mask;
 	__aligned_u64 create_flags;
@@ -122,6 +123,7 @@ enum {
 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
 	HNS_ROCE_UCTX_CONFIG_DCA = 1 << 3,
+	HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
 };
 
 enum {
@@ -129,6 +131,7 @@ enum {
 	HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
 	HNS_ROCE_UCTX_RSP_DCA_FLAGS = HNS_ROCE_UCTX_CONFIG_DCA,
+	HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
 };
 
 struct hns_roce_ib_alloc_ucontext_resp {
diff --git a/providers/hns/hns_roce_u.c b/providers/hns/hns_roce_u.c
index 0660081..02ad880 100644
--- a/providers/hns/hns_roce_u.c
+++ b/providers/hns/hns_roce_u.c
@@ -267,7 +267,7 @@ static void ucontext_set_cmd(struct hns_roce_alloc_ucontext *cmd,
 			     struct hnsdv_context_attr *attr)
 {
 	cmd->config |= HNS_ROCE_EXSGE_FLAGS | HNS_ROCE_RQ_INLINE_FLAGS |
-		       HNS_ROCE_CQE_INLINE_FLAGS;
+		       HNS_ROCE_CQE_INLINE_FLAGS | HNS_ROCE_UCTX_DYN_QP_PGSZ;
 
 	if (!attr || !(attr->flags & HNSDV_CONTEXT_FLAGS_DCA))
 		return;
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index 5501d8e..ae9ae51 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -409,6 +409,7 @@ struct hns_roce_qp {
 	uint8_t sl;
 	uint8_t tc_mode;
 	uint8_t priority;
+	uint8_t pageshift;
 	unsigned int qkey;
 	enum ibv_mtu path_mtu;
 
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
index 7b58dd0..f76341c 100644
--- a/providers/hns/hns_roce_u_verbs.c
+++ b/providers/hns/hns_roce_u_verbs.c
@@ -1327,31 +1327,69 @@ static void free_recv_rinl_buf(struct hns_roce_rinl_buf *rinl_buf)
 	}
 }
 
+static void get_best_multi_region_pg_shift(struct hns_roce_device *hr_dev,
+					   struct hns_roce_context *ctx,
+					   struct hns_roce_qp *qp, bool dca_en)
+{
+	uint32_t ext_sge_size;
+	uint32_t sq_size;
+	uint32_t rq_size;
+	uint8_t pg_shift;
+
+	if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ) || dca_en) {
+		qp->pageshift = HNS_HW_PAGE_SHIFT;
+		return;
+	}
+
+	/*
+	 * The larger the pagesize used, the better the performance, but it
+	 * may waste more memory. Therefore, we use the least common multiple
+	 * (aligned to power of 2) of sq wqe buffer size, rq wqe buffer size,
+	 * and ext_sge buffer size as the pagesize. Additionally, since the
+	 * kernel cannot guarantee the allocation of contiguous memory larger
+	 * than the system page, the pagesize must be smaller than the system
+	 * page.
+	 */
+	sq_size = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+	ext_sge_size = qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift;
+	rq_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+
+	pg_shift = max_t(uint8_t, sq_size ? hr_ilog32(sq_size) : 0,
+			 ext_sge_size ? hr_ilog32(ext_sge_size) : 0);
+	pg_shift = max_t(uint8_t, pg_shift, rq_size ? hr_ilog32(rq_size) : 0);
+	pg_shift = max_t(uint8_t, pg_shift, HNS_HW_PAGE_SHIFT);
+	qp->pageshift = min_t(uint8_t, pg_shift, hr_ilog32(hr_dev->page_size));
+}
+
 static int calc_qp_buff_size(struct hns_roce_device *hr_dev,
-			     struct hns_roce_qp *qp)
+			     struct hns_roce_context *ctx,
+			     struct hns_roce_qp *qp, bool dca_en)
 {
 	struct hns_roce_wq *sq = &qp->sq;
 	struct hns_roce_wq *rq = &qp->rq;
+	unsigned int page_size;
 	unsigned int size;
 
 	qp->buf_size = 0;
+	get_best_multi_region_pg_shift(hr_dev, ctx, qp, dca_en);
+	page_size = 1 << qp->pageshift;
 
 	/* SQ WQE */
 	sq->offset = 0;
-	size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift);
+	size = align(sq->wqe_cnt << sq->wqe_shift, page_size);
 	qp->buf_size += size;
 
 	/* extend SGE WQE in SQ */
 	qp->ex_sge.offset = qp->buf_size;
 	if (qp->ex_sge.sge_cnt > 0) {
-		size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt,
-					      qp->ex_sge.sge_shift);
+		size = align(qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift,
+			     page_size);
 		qp->buf_size += size;
 	}
 
 	/* RQ WQE */
 	rq->offset = qp->buf_size;
-	size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift);
+	size = align(rq->wqe_cnt << rq->wqe_shift, page_size);
 	qp->buf_size += size;
 
 	if (qp->buf_size < 1)
@@ -1375,7 +1413,7 @@ static inline bool check_qp_support_dca(struct hns_roce_dca_ctx *dca_ctx,
 	if (hns_attr &&
 	    (hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS) &&
 	    (hns_attr->create_flags & HNSDV_QP_CREATE_ENABLE_DCA_MODE))
-		return true;
+		return dca_ctx->max_size > 0;
 
 	return false;
 }
@@ -1396,9 +1434,12 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
 			struct hns_roce_qp *qp, struct hns_roce_context *ctx)
 {
 	struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device);
+	bool dca_en = check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr);
+	int ret;
 
-	if (calc_qp_buff_size(hr_dev, qp))
-		return -EINVAL;
+	ret = calc_qp_buff_size(hr_dev, ctx, qp, dca_en);
+	if (ret)
+		return ret;
 
 	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
 	if (!qp->sq.wrid)
@@ -1416,19 +1457,18 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
 		goto err_alloc;
 	}
 
-	if (check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr) &&
-	    ctx->dca_ctx.max_size > 0) {
+	if (dca_en) {
 		/* when DCA is enabled, use a buffer list to store page addr */
 		qp->buf.buf = NULL;
 		qp->dca_wqe.max_cnt = hr_hw_page_count(qp->buf_size);
-		qp->dca_wqe.shift = HNS_HW_PAGE_SHIFT;
+		qp->dca_wqe.shift = qp->pageshift;
 		qp->dca_wqe.bufs = calloc(qp->dca_wqe.max_cnt, sizeof(void *));
 		if (!qp->dca_wqe.bufs)
 			goto err_alloc;
 		verbs_debug(&ctx->ibv_ctx, "alloc DCA buf.\n");
 	} else {
 		if (hns_roce_alloc_buf(&qp->buf, qp->buf_size,
-				       HNS_HW_PAGE_SIZE))
+				       1 << qp->pageshift))
 			goto err_alloc;
 	}
 
@@ -1642,6 +1682,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
 	cmd_ex.buf_addr = (uintptr_t)qp->buf.buf;
 	cmd_ex.log_sq_stride = qp->sq.wqe_shift;
 	cmd_ex.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
+	cmd_ex.pageshift = qp->pageshift;
 
 	if (cmd_flag->congest_type_flags) {
 		cmd_ex.comp_mask |= HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE;
-- 
2.25.1

||||
@@ -1,6 +1,6 @@
 Name: rdma-core
 Version: 41.0
-Release: 15
+Release: 16
 Summary: RDMA core userspace libraries and daemons
 License: GPLv2 or BSD
 Url: https://github.com/linux-rdma/rdma-core
@@ -63,6 +63,7 @@ Patch53: 0054-libhns-return-error-when-post-send-in-reset-state.patch
 Patch54: 0055-libhns-separate-the-initialization-steps-of-lock.patch
 Patch55: 0056-libhns-assign-doorbell-to-zero-when-allocate-it.patch
 patch56: 0057-libhns-Fix-missing-reset-notification.patch
+patch57: 0058-libhns-Support-flexible-WQE-buffer-page-size.patch
 
 BuildRequires: binutils cmake >= 2.8.11 gcc libudev-devel pkgconfig pkgconfig(libnl-3.0)
 BuildRequires: pkgconfig(libnl-route-3.0) valgrind-devel systemd systemd-devel
@@ -310,6 +311,12 @@ fi
 %{_mandir}/*
 
 %changelog
+* Tue Oct 24 2023 Ran Zhou <zhouran10@h-partners.com> - 41.0-16
+- Type: requirement
+- ID: NA
+- SUG: NA
+- DESC: Support flexible WQE buffer page size
+
 * Tue Sep 26 2023 Juan Zhou <zhoujuan51@h-partners.com> - 41.0-15
- Type: requirement
- ID: NA
Loading…
x
Reference in New Issue
Block a user