From c4423b70160eb7ae91dac9f2cf61513758ee017d Mon Sep 17 00:00:00 2001
From: Klaus Jensen <k.jensen@samsung.com>
Date: Tue, 29 Oct 2024 13:15:19 +0100
Subject: [PATCH] hw/nvme: fix handling of over-committed queues

If a host chooses to use the SQHD "hint" in the CQE to know if there is
room in the submission queue for additional commands, it may result in a
situation where there are not enough internal resources (struct
NvmeRequest) available to process the command. For lack of a better
term, the host may "over-commit" the device (i.e., it may have more
inflight commands than the queue size).
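
To illustrate, here is a minimal sketch of the kind of SQHD-based flow
control a host driver might implement (hypothetical helper and names,
not taken from any particular driver):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Illustrative host-side helper: decide whether the submission queue
     * can accept another command, based on the most recent SQHD value
     * reported in a CQE.  An NVMe queue is full when advancing the tail
     * would make it equal to the head, so at most qsize - 1 slots can be
     * occupied at once.
     */
    static bool nvme_sq_has_room(uint16_t tail, uint16_t last_sqhd,
                                 uint16_t qsize)
    {
        /* slots the host believes are still owned by the device */
        uint16_t used = (tail + qsize - last_sqhd) % qsize;

        return used < qsize - 1;
    }

A host relying on this hint is behaving legitimately; the bug addressed
here is in how hw/nvme copes with the over-commit that results.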

For example, assume a queue with N entries. The host submits N commands
and all are picked up for processing, advancing the head and emptying
the queue. Regardless of which of these N commands completes first, the
SQHD field of that CQE will indicate to the host that the queue is
empty, which allows the host to issue N commands again. However, if the
device has not yet posted CQEs for all the previous commands, it will
have fewer than N resources available to process the new commands, so
queue processing is suspended.

And here lies an 11-year-old latent bug. In the absence of any additional
tail updates on the submission queue, we never schedule the processing
bottom-half again unless we observe a head update on an associated full
completion queue. This has been sufficient to handle N-to-1 SQ/CQ setups
(in the absence of over-commit, of course). Incidentally, that "kick all
associated SQs" mechanism can now be killed, since we now just schedule
queue processing when we return a processing resource to a non-empty
submission queue, which happens to cover both edge cases. However, we
must retain kicking the CQ if it was previously full.

So, apparently, no previous driver tested with hw/nvme has ever used
SQHD (e.g., neither the Linux NVMe driver nor SPDK uses it). But then
OSv shows up with a driver that actually does. I salute you.

Fixes: f3c507adcd7b ("NVMe: Initial commit for new storage interface")
Cc: qemu-stable@nongnu.org
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2388
Reported-by: Waldemar Kozaczuk <jwkozaczuk@gmail.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Signed-off-by: Zhongrui Tang <tangzhongrui_yewu@cmss.chinamobile.com>
---
 hw/nvme/ctrl.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 104aebc5ea..29445938d5 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1516,9 +1516,16 @@ static void nvme_post_cqes(void *opaque)
             stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
             break;
         }
+
         QTAILQ_REMOVE(&cq->req_list, req, entry);
+
         nvme_inc_cq_tail(cq);
         nvme_sg_unmap(&req->sg);
+
+        if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) {
+            qemu_bh_schedule(sq->bh);
+        }
+
         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
     }
     if (cq->tail != cq->head) {
@@ -7575,7 +7582,6 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         /* Completion queue doorbell write */

         uint16_t new_head = val & 0xffff;
-        int start_sqs;
         NvmeCQueue *cq;

         qid = (addr - (0x1000 + (1 << 2))) >> 3;
@@ -7626,18 +7632,15 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)

         trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);

-        start_sqs = nvme_cq_full(cq) ? 1 : 0;
+        /* scheduled deferred cqe posting if queue was previously full */
+        if (nvme_cq_full(cq)) {
+            qemu_bh_schedule(cq->bh);
+        }
+
         cq->head = new_head;
         if (!qid && n->dbbuf_enabled) {
             stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
         }
-        if (start_sqs) {
-            NvmeSQueue *sq;
-            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
-                qemu_bh_schedule(sq->bh);
-            }
-            qemu_bh_schedule(cq->bh);
-        }

         if (cq->tail == cq->head) {
             if (cq->irq_enabled) {
--
2.41.0.windows.1