rdma-core/0058-libhns-Support-flexible-WQE-buffer-page-size.patch

From d628c51d25b972a7d26e53ea400b3a0679d51f91 Mon Sep 17 00:00:00 2001
From: Chengchang Tang <tangchengchang@huawei.com>
Date: Mon, 23 Oct 2023 21:13:03 +0800
Subject: [PATCH] libhns: Support flexible WQE buffer page size
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I87LTM
--------------------------------------------------------------------------
Currently, the driver always allocates 4 KB pages for the userspace WQE
buffer, even on a system with 64 KB pages, so HW reads WQEs at a 4 KB
granularity regardless of the system page size. Since 1024-byte inline
data is supported, in the SQ inline case HW ends up switching pages
every 4 WQEs (4 KB / 1 KB). Each page switch introduces a delay of
about 400 ns, an average of about 100 ns per packet.

To improve performance, allow the userspace driver to allocate the WQE
buffer with a larger page size and so reduce the latency introduced by
HW page switching. The userspace driver may now use any page size
between 4 KB and the system page size. During ibv_create_qp() it
dynamically selects an appropriate page size based on ibv_qp_cap,
improving performance while keeping memory consumption low.
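
A sketch of the page-shift selection, restated outside the provider (a
simplified mirror of the get_best_multi_region_pg_shift() hunk below;
ceil_log2() and pick_wqe_page_shift() are illustrative names, not part
of this patch, and the three sizes are the SQ, extended-SGE and RQ WQE
buffer sizes derived from ibv_qp_cap):

  #include <stdint.h>
  #include <unistd.h>

  #define HNS_HW_PAGE_SHIFT 12   /* 4 KB, the previous fixed page size */

  /* ceil(log2(v)) for v > 1, 0 otherwise; stands in for hr_ilog32() */
  static uint8_t ceil_log2(uint32_t v)
  {
          return v > 1 ? 32 - __builtin_clz(v - 1) : 0;
  }

  /* Largest of the three buffer sizes, rounded up to a power of two and
   * clamped to [4 KB, system page size]. */
  static uint8_t pick_wqe_page_shift(uint32_t sq_bytes, uint32_t ext_sge_bytes,
                                     uint32_t rq_bytes)
  {
          uint8_t sys_shift = ceil_log2((uint32_t)sysconf(_SC_PAGESIZE));
          uint8_t shift = HNS_HW_PAGE_SHIFT;

          if (ceil_log2(sq_bytes) > shift)
                  shift = ceil_log2(sq_bytes);
          if (ceil_log2(ext_sge_bytes) > shift)
                  shift = ceil_log2(ext_sge_bytes);
          if (ceil_log2(rq_bytes) > shift)
                  shift = ceil_log2(rq_bytes);

          return shift < sys_shift ? shift : sys_shift;
  }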

This feature requires matching support in the kernel-mode driver. To
keep forward compatibility, if the kernel-mode driver does not support
it, the userspace driver keeps using the fixed 4 KB page size for the
WQE buffer.
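
No application change is needed in either case: an ordinary libibverbs
QP creation such as the hypothetical snippet below (pd, cq and the cap
values are placeholders, not part of this patch) works the same on
kernels with and without the feature, with the provider transparently
falling back to 4 KB pages on older kernels.

  #include <infiniband/verbs.h>

  /* Hypothetical example: the provider sizes the SQ/RQ/extended-SGE buffers
   * from these cap values and derives the WQE buffer page size from them
   * (or keeps fixed 4 KB pages when the kernel lacks the feature). */
  static struct ibv_qp *create_example_qp(struct ibv_pd *pd, struct ibv_cq *cq)
  {
          struct ibv_qp_init_attr attr = {
                  .send_cq = cq,
                  .recv_cq = cq,
                  .qp_type = IBV_QPT_RC,
                  .cap = {
                          .max_send_wr     = 128,
                          .max_recv_wr     = 128,
                          .max_send_sge    = 2,
                          .max_recv_sge    = 2,
                          .max_inline_data = 1024,  /* SQ inline, as above */
                  },
          };

          return ibv_create_qp(pd, &attr);
  }
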
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
---
kernel-headers/rdma/hns-abi.h | 5 ++-
providers/hns/hns_roce_u.c | 2 +-
providers/hns/hns_roce_u.h | 1 +
providers/hns/hns_roce_u_verbs.c | 65 ++++++++++++++++++++++++++------
4 files changed, 59 insertions(+), 14 deletions(-)
diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h
index cab941f..157dc9d 100644
--- a/kernel-headers/rdma/hns-abi.h
+++ b/kernel-headers/rdma/hns-abi.h
@@ -81,7 +81,8 @@ struct hns_roce_ib_create_qp {
__u8 log_sq_bb_count;
__u8 log_sq_stride;
__u8 sq_no_prefetch;
- __u8 reserved[5];
+ __u8 reserved[4];
+ __u8 pageshift;
__aligned_u64 sdb_addr;
__aligned_u64 comp_mask;
__aligned_u64 create_flags;
@@ -122,6 +123,7 @@ enum {
HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
HNS_ROCE_UCTX_CONFIG_DCA = 1 << 3,
+ HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
};
enum {
@@ -129,6 +131,7 @@ enum {
HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
HNS_ROCE_UCTX_RSP_DCA_FLAGS = HNS_ROCE_UCTX_CONFIG_DCA,
+ HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
};
struct hns_roce_ib_alloc_ucontext_resp {
diff --git a/providers/hns/hns_roce_u.c b/providers/hns/hns_roce_u.c
index 0660081..02ad880 100644
--- a/providers/hns/hns_roce_u.c
+++ b/providers/hns/hns_roce_u.c
@@ -267,7 +267,7 @@ static void ucontext_set_cmd(struct hns_roce_alloc_ucontext *cmd,
struct hnsdv_context_attr *attr)
{
cmd->config |= HNS_ROCE_EXSGE_FLAGS | HNS_ROCE_RQ_INLINE_FLAGS |
- HNS_ROCE_CQE_INLINE_FLAGS;
+ HNS_ROCE_CQE_INLINE_FLAGS | HNS_ROCE_UCTX_DYN_QP_PGSZ;
if (!attr || !(attr->flags & HNSDV_CONTEXT_FLAGS_DCA))
return;
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index 5501d8e..ae9ae51 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -409,6 +409,7 @@ struct hns_roce_qp {
uint8_t sl;
uint8_t tc_mode;
uint8_t priority;
+ uint8_t pageshift;
unsigned int qkey;
enum ibv_mtu path_mtu;
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
index 7b58dd0..f76341c 100644
--- a/providers/hns/hns_roce_u_verbs.c
+++ b/providers/hns/hns_roce_u_verbs.c
@@ -1327,31 +1327,69 @@ static void free_recv_rinl_buf(struct hns_roce_rinl_buf *rinl_buf)
}
}
+static void get_best_multi_region_pg_shift(struct hns_roce_device *hr_dev,
+ struct hns_roce_context *ctx,
+ struct hns_roce_qp *qp, bool dca_en)
+{
+ uint32_t ext_sge_size;
+ uint32_t sq_size;
+ uint32_t rq_size;
+ uint8_t pg_shift;
+
+ if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ) || dca_en) {
+ qp->pageshift = HNS_HW_PAGE_SHIFT;
+ return;
+ }
+
+ /*
+ * The larger the pagesize used, the better the performance, but it
+ * may waste more memory. Therefore, we use the least common multiple
+ * (aligned to power of 2) of sq wqe buffer size, rq wqe buffer size,
+ * and ext_sge buffer size as the pagesize. Additionally, since the
+ * kernel cannot guarantee the allocation of contiguous memory larger
+ * than the system page, the pagesize must be smaller than the system
+ * page.
+ */
+ sq_size = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+ ext_sge_size = qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift;
+ rq_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+
+ pg_shift = max_t(uint8_t, sq_size ? hr_ilog32(sq_size) : 0,
+ ext_sge_size ? hr_ilog32(ext_sge_size) : 0);
+ pg_shift = max_t(uint8_t, pg_shift, rq_size ? hr_ilog32(rq_size) : 0);
+ pg_shift = max_t(uint8_t, pg_shift, HNS_HW_PAGE_SHIFT);
+ qp->pageshift = min_t(uint8_t, pg_shift, hr_ilog32(hr_dev->page_size));
+}
+
static int calc_qp_buff_size(struct hns_roce_device *hr_dev,
- struct hns_roce_qp *qp)
+ struct hns_roce_context *ctx,
+ struct hns_roce_qp *qp, bool dca_en)
{
struct hns_roce_wq *sq = &qp->sq;
struct hns_roce_wq *rq = &qp->rq;
+ unsigned int page_size;
unsigned int size;
qp->buf_size = 0;
+ get_best_multi_region_pg_shift(hr_dev, ctx, qp, dca_en);
+ page_size = 1 << qp->pageshift;
/* SQ WQE */
sq->offset = 0;
- size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift);
+ size = align(sq->wqe_cnt << sq->wqe_shift, page_size);
qp->buf_size += size;
/* extend SGE WQE in SQ */
qp->ex_sge.offset = qp->buf_size;
if (qp->ex_sge.sge_cnt > 0) {
- size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt,
- qp->ex_sge.sge_shift);
+ size = align(qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift,
+ page_size);
qp->buf_size += size;
}
/* RQ WQE */
rq->offset = qp->buf_size;
- size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift);
+ size = align(rq->wqe_cnt << rq->wqe_shift, page_size);
qp->buf_size += size;
if (qp->buf_size < 1)
@@ -1375,7 +1413,7 @@ static inline bool check_qp_support_dca(struct hns_roce_dca_ctx *dca_ctx,
if (hns_attr &&
(hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS) &&
(hns_attr->create_flags & HNSDV_QP_CREATE_ENABLE_DCA_MODE))
- return true;
+ return dca_ctx->max_size > 0;
return false;
}
@@ -1396,9 +1434,12 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
struct hns_roce_qp *qp, struct hns_roce_context *ctx)
{
struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device);
+ bool dca_en = check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr);
+ int ret;
- if (calc_qp_buff_size(hr_dev, qp))
- return -EINVAL;
+ ret = calc_qp_buff_size(hr_dev, ctx, qp, dca_en);
+ if (ret)
+ return ret;
qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
if (!qp->sq.wrid)
@@ -1416,19 +1457,18 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
goto err_alloc;
}
- if (check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr) &&
- ctx->dca_ctx.max_size > 0) {
+ if (dca_en) {
/* when DCA is enabled, use a buffer list to store page addr */
qp->buf.buf = NULL;
qp->dca_wqe.max_cnt = hr_hw_page_count(qp->buf_size);
- qp->dca_wqe.shift = HNS_HW_PAGE_SHIFT;
+ qp->dca_wqe.shift = qp->pageshift;
qp->dca_wqe.bufs = calloc(qp->dca_wqe.max_cnt, sizeof(void *));
if (!qp->dca_wqe.bufs)
goto err_alloc;
verbs_debug(&ctx->ibv_ctx, "alloc DCA buf.\n");
} else {
if (hns_roce_alloc_buf(&qp->buf, qp->buf_size,
- HNS_HW_PAGE_SIZE))
+ 1 << qp->pageshift))
goto err_alloc;
}
@@ -1642,6 +1682,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
cmd_ex.buf_addr = (uintptr_t)qp->buf.buf;
cmd_ex.log_sq_stride = qp->sq.wqe_shift;
cmd_ex.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
+ cmd_ex.pageshift = qp->pageshift;
if (cmd_flag->congest_type_flags) {
cmd_ex.comp_mask |= HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE;
--
2.25.1