* [PATCH 1/4] block: add mq_ops->queue_rqs hook
2021-12-16 16:05 [PATCHSET v4 0/4] Add support for list issue Jens Axboe
@ 2021-12-16 16:05 ` Jens Axboe
2021-12-16 16:05 ` [PATCH 2/4] nvme: split command copy into a helper Jens Axboe
` (2 subsequent siblings)
3 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2021-12-16 16:05 UTC (permalink / raw)
To: io-uring, linux-block, linux-nvme; +Cc: Jens Axboe, Christoph Hellwig
If we have a list of requests in our plug list, send it to the driver in
one go, if possible. The driver must set mq_ops->queue_rqs() to support
this, if not the usual one-by-one path is used.
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
block/blk-mq.c | 26 +++++++++++++++++++++++---
include/linux/blk-mq.h | 8 ++++++++
2 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 75154cc788db..51991232824a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2553,6 +2553,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
struct blk_mq_hw_ctx *this_hctx;
struct blk_mq_ctx *this_ctx;
+ struct request *rq;
unsigned int depth;
LIST_HEAD(list);
@@ -2561,7 +2562,28 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
plug->rq_count = 0;
if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
- struct request_queue *q = rq_list_peek(&plug->mq_list)->q;
+ struct request_queue *q;
+
+ rq = rq_list_peek(&plug->mq_list);
+ q = rq->q;
+
+ /*
+ * Peek first request and see if we have a ->queue_rqs() hook.
+ * If we do, we can dispatch the whole plug list in one go. We
+ * already know at this point that all requests belong to the
+ * same queue, caller must ensure that's the case.
+ *
+ * Since we pass off the full list to the driver at this point,
+ * we do not increment the active request count for the queue.
+ * Bypass shared tags for now because of that.
+ */
+ if (q->mq_ops->queue_rqs &&
+ !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+ blk_mq_run_dispatch_ops(q,
+ q->mq_ops->queue_rqs(&plug->mq_list));
+ if (rq_list_empty(plug->mq_list))
+ return;
+ }
blk_mq_run_dispatch_ops(q,
blk_mq_plug_issue_direct(plug, false));
@@ -2573,8 +2595,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
this_ctx = NULL;
depth = 0;
do {
- struct request *rq;
-
rq = rq_list_pop(&plug->mq_list);
if (!this_hctx) {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 772f8f921526..550996cf419c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -492,6 +492,14 @@ struct blk_mq_ops {
*/
void (*commit_rqs)(struct blk_mq_hw_ctx *);
+ /**
+ * @queue_rqs: Queue a list of new requests. Driver is guaranteed
+ * that each request belongs to the same queue. If the driver doesn't
+ * empty the @rqlist completely, then the rest will be queued
+ * individually by the block layer upon return.
+ */
+ void (*queue_rqs)(struct request **rqlist);
+
/**
* @get_budget: Reserve budget before queue request, once .queue_rq is
* run, it is driver's responsibility to release the
--
2.34.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 2/4] nvme: split command copy into a helper
2021-12-16 16:05 [PATCHSET v4 0/4] Add support for list issue Jens Axboe
2021-12-16 16:05 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
@ 2021-12-16 16:05 ` Jens Axboe
2021-12-16 16:05 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
2021-12-16 16:05 ` [PATCH 4/4] nvme: add support for mq_ops->queue_rqs() Jens Axboe
3 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2021-12-16 16:05 UTC (permalink / raw)
To: io-uring, linux-block, linux-nvme
Cc: Jens Axboe, Chaitanya Kulkarni, Hannes Reinecke, Max Gurtovoy
We'll need it for batched submit as well.
Reviewed-by: Chaitanya Kulkarni <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
Reviewed-by: Max Gurtovoy <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
drivers/nvme/host/pci.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8637538f3fd5..09ea21f75439 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -500,6 +500,15 @@ static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
nvmeq->last_sq_tail = nvmeq->sq_tail;
}
+static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq,
+ struct nvme_command *cmd)
+{
+ memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), cmd,
+ sizeof(*cmd));
+ if (++nvmeq->sq_tail == nvmeq->q_depth)
+ nvmeq->sq_tail = 0;
+}
+
/**
* nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
@@ -510,10 +519,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
- memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
- cmd, sizeof(*cmd));
- if (++nvmeq->sq_tail == nvmeq->q_depth)
- nvmeq->sq_tail = 0;
+ nvme_sq_copy_cmd(nvmeq, cmd);
nvme_write_sq_db(nvmeq, write_sq);
spin_unlock(&nvmeq->sq_lock);
}
--
2.34.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 3/4] nvme: separate command prep and issue
2021-12-16 16:05 [PATCHSET v4 0/4] Add support for list issue Jens Axboe
2021-12-16 16:05 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
2021-12-16 16:05 ` [PATCH 2/4] nvme: split command copy into a helper Jens Axboe
@ 2021-12-16 16:05 ` Jens Axboe
2021-12-16 16:05 ` [PATCH 4/4] nvme: add support for mq_ops->queue_rqs() Jens Axboe
3 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2021-12-16 16:05 UTC (permalink / raw)
To: io-uring, linux-block, linux-nvme
Cc: Jens Axboe, Hannes Reinecke, Christoph Hellwig
Add a nvme_prep_rq() helper to setup a command, and nvme_queue_rq() is
adapted to use this helper.
Reviewed-by: Hannes Reinecke <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
drivers/nvme/host/pci.c | 57 ++++++++++++++++++++++++-----------------
1 file changed, 33 insertions(+), 24 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 09ea21f75439..6be6b1ab4285 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -918,52 +918,32 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
return BLK_STS_OK;
}
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
-static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
- const struct blk_mq_queue_data *bd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
{
- struct nvme_ns *ns = hctx->queue->queuedata;
- struct nvme_queue *nvmeq = hctx->driver_data;
- struct nvme_dev *dev = nvmeq->dev;
- struct request *req = bd->rq;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_command *cmnd = &iod->cmd;
blk_status_t ret;
iod->aborted = 0;
iod->npages = -1;
iod->nents = 0;
- /*
- * We should not need to do this, but we're still using this to
- * ensure we can drain requests on a dying queue.
- */
- if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
- return BLK_STS_IOERR;
-
- if (!nvme_check_ready(&dev->ctrl, req, true))
- return nvme_fail_nonready_command(&dev->ctrl, req);
-
- ret = nvme_setup_cmd(ns, req);
+ ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
return ret;
if (blk_rq_nr_phys_segments(req)) {
- ret = nvme_map_data(dev, req, cmnd);
+ ret = nvme_map_data(dev, req, &iod->cmd);
if (ret)
goto out_free_cmd;
}
if (blk_integrity_rq(req)) {
- ret = nvme_map_metadata(dev, req, cmnd);
+ ret = nvme_map_metadata(dev, req, &iod->cmd);
if (ret)
goto out_unmap_data;
}
blk_mq_start_request(req);
- nvme_submit_cmd(nvmeq, cmnd, bd->last);
return BLK_STS_OK;
out_unmap_data:
nvme_unmap_data(dev, req);
@@ -972,6 +952,35 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return ret;
}
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
+static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
+ struct request *req = bd->rq;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ blk_status_t ret;
+
+ /*
+ * We should not need to do this, but we're still using this to
+ * ensure we can drain requests on a dying queue.
+ */
+ if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+ return BLK_STS_IOERR;
+
+ if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
+ return nvme_fail_nonready_command(&dev->ctrl, req);
+
+ ret = nvme_prep_rq(dev, req);
+ if (unlikely(ret))
+ return ret;
+ nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
+ return BLK_STS_OK;
+}
+
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
--
2.34.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 4/4] nvme: add support for mq_ops->queue_rqs()
2021-12-16 16:05 [PATCHSET v4 0/4] Add support for list issue Jens Axboe
` (2 preceding siblings ...)
2021-12-16 16:05 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
@ 2021-12-16 16:05 ` Jens Axboe
2021-12-16 16:11 ` [PATCH v2 " Jens Axboe
3 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2021-12-16 16:05 UTC (permalink / raw)
To: io-uring, linux-block, linux-nvme
Cc: Jens Axboe, Hannes Reinecke, Keith Busch
This enables the block layer to send us a full plug list of requests
that need submitting. The block layer guarantees that they all belong
to the same queue, but we do have to check the hardware queue mapping
for each request.
If errors are encountered, leave them in the passed in list. Then the
block layer will handle them individually.
This is good for about a 4% improvement in peak performance, taking us
from 9.6M to 10M IOPS/core.
Reviewed-by: Hannes Reinecke <[email protected]>
Reviewed-by: Keith Busch <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
drivers/nvme/host/pci.c | 58 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6be6b1ab4285..e34ad67c4c41 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -981,6 +981,63 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+{
+ spin_lock(&nvmeq->sq_lock);
+ while (!rq_list_empty(*rqlist)) {
+ struct request *req = rq_list_pop(rqlist);
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ nvme_sq_copy_cmd(nvmeq, &iod->cmd);
+ }
+ nvme_write_sq_db(nvmeq, true);
+ spin_unlock(&nvmeq->sq_lock);
+}
+
+static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
+{
+ /*
+ * We should not need to do this, but we're still using this to
+ * ensure we can drain requests on a dying queue.
+ */
+ if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+ return false;
+ if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
+ return false;
+
+ req->mq_hctx->tags->rqs[req->tag] = req;
+ return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+}
+
+static void nvme_queue_rqs(struct request **rqlist)
+{
+ struct request *req = rq_list_peek(rqlist), *prev = NULL;
+ struct request *requeue_list = NULL;
+
+ do {
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+
+ if (!nvme_prep_rq_batch(nvmeq, req)) {
+ /* detach 'req' and add to remainder list */
+ if (prev)
+ prev->rq_next = req->rq_next;
+ rq_list_add(&requeue_list, req);
+ } else {
+ prev = req;
+ }
+
+ req = rq_list_next(req);
+ if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
+ /* detach rest of list, and submit */
+ prev->rq_next = NULL;
+ nvme_submit_cmds(nvmeq, rqlist);
+ *rqlist = req;
+ }
+ } while (req);
+
+ *rqlist = requeue_list;
+}
+
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1678,6 +1735,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
static const struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
+ .queue_rqs = nvme_queue_rqs,
.complete = nvme_pci_complete_rq,
.commit_rqs = nvme_commit_rqs,
.init_hctx = nvme_init_hctx,
--
2.34.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v2 4/4] nvme: add support for mq_ops->queue_rqs()
2021-12-16 16:05 ` [PATCH 4/4] nvme: add support for mq_ops->queue_rqs() Jens Axboe
@ 2021-12-16 16:11 ` Jens Axboe
0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2021-12-16 16:11 UTC (permalink / raw)
To: io-uring, linux-block, linux-nvme; +Cc: Hannes Reinecke, Keith Busch
This enables the block layer to send us a full plug list of requests
that need submitting. The block layer guarantees that they all belong
to the same queue, but we do have to check the hardware queue mapping
for each request.
If errors are encountered, leave them in the passed in list. Then the
block layer will handle them individually.
This is good for about a 4% improvement in peak performance, taking us
from 9.6M to 10M IOPS/core.
Reviewed-by: Hannes Reinecke <[email protected]>
Reviewed-by: Keith Busch <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
Added the prev check suggested by Max.
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6be6b1ab4285..58d97660374a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -981,6 +981,64 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+{
+ spin_lock(&nvmeq->sq_lock);
+ while (!rq_list_empty(*rqlist)) {
+ struct request *req = rq_list_pop(rqlist);
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ nvme_sq_copy_cmd(nvmeq, &iod->cmd);
+ }
+ nvme_write_sq_db(nvmeq, true);
+ spin_unlock(&nvmeq->sq_lock);
+}
+
+static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
+{
+ /*
+ * We should not need to do this, but we're still using this to
+ * ensure we can drain requests on a dying queue.
+ */
+ if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+ return false;
+ if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
+ return false;
+
+ req->mq_hctx->tags->rqs[req->tag] = req;
+ return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+}
+
+static void nvme_queue_rqs(struct request **rqlist)
+{
+ struct request *req = rq_list_peek(rqlist), *prev = NULL;
+ struct request *requeue_list = NULL;
+
+ do {
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+
+ if (!nvme_prep_rq_batch(nvmeq, req)) {
+ /* detach 'req' and add to remainder list */
+ if (prev)
+ prev->rq_next = req->rq_next;
+ rq_list_add(&requeue_list, req);
+ } else {
+ prev = req;
+ }
+
+ req = rq_list_next(req);
+ if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
+ /* detach rest of list, and submit */
+ if (prev)
+ prev->rq_next = NULL;
+ nvme_submit_cmds(nvmeq, rqlist);
+ *rqlist = req;
+ }
+ } while (req);
+
+ *rqlist = requeue_list;
+}
+
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1678,6 +1736,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
static const struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
+ .queue_rqs = nvme_queue_rqs,
.complete = nvme_pci_complete_rq,
.commit_rqs = nvme_commit_rqs,
.init_hctx = nvme_init_hctx,
--
Jens Axboe
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 1/4] block: add mq_ops->queue_rqs hook
2021-12-16 16:38 [PATCHSET v5 0/4] Add support for list issue Jens Axboe
@ 2021-12-16 16:38 ` Jens Axboe
0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2021-12-16 16:38 UTC (permalink / raw)
To: io-uring, linux-block, linux-nvme; +Cc: Jens Axboe, Christoph Hellwig
If we have a list of requests in our plug list, send it to the driver in
one go, if possible. The driver must set mq_ops->queue_rqs() to support
this, if not the usual one-by-one path is used.
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
block/blk-mq.c | 26 +++++++++++++++++++++++---
include/linux/blk-mq.h | 8 ++++++++
2 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 75154cc788db..51991232824a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2553,6 +2553,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
struct blk_mq_hw_ctx *this_hctx;
struct blk_mq_ctx *this_ctx;
+ struct request *rq;
unsigned int depth;
LIST_HEAD(list);
@@ -2561,7 +2562,28 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
plug->rq_count = 0;
if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
- struct request_queue *q = rq_list_peek(&plug->mq_list)->q;
+ struct request_queue *q;
+
+ rq = rq_list_peek(&plug->mq_list);
+ q = rq->q;
+
+ /*
+ * Peek first request and see if we have a ->queue_rqs() hook.
+ * If we do, we can dispatch the whole plug list in one go. We
+ * already know at this point that all requests belong to the
+ * same queue, caller must ensure that's the case.
+ *
+ * Since we pass off the full list to the driver at this point,
+ * we do not increment the active request count for the queue.
+ * Bypass shared tags for now because of that.
+ */
+ if (q->mq_ops->queue_rqs &&
+ !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+ blk_mq_run_dispatch_ops(q,
+ q->mq_ops->queue_rqs(&plug->mq_list));
+ if (rq_list_empty(plug->mq_list))
+ return;
+ }
blk_mq_run_dispatch_ops(q,
blk_mq_plug_issue_direct(plug, false));
@@ -2573,8 +2595,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
this_ctx = NULL;
depth = 0;
do {
- struct request *rq;
-
rq = rq_list_pop(&plug->mq_list);
if (!this_hctx) {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 772f8f921526..550996cf419c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -492,6 +492,14 @@ struct blk_mq_ops {
*/
void (*commit_rqs)(struct blk_mq_hw_ctx *);
+ /**
+ * @queue_rqs: Queue a list of new requests. Driver is guaranteed
+ * that each request belongs to the same queue. If the driver doesn't
+ * empty the @rqlist completely, then the rest will be queued
+ * individually by the block layer upon return.
+ */
+ void (*queue_rqs)(struct request **rqlist);
+
/**
* @get_budget: Reserve budget before queue request, once .queue_rq is
* run, it is driver's responsibility to release the
--
2.34.1
^ permalink raw reply related [flat|nested] 11+ messages in thread