* [PATCH V2 1/2] block: pass io_comp_batch to rq_end_io_fn callback
From: Ming Lei @ 2026-01-16 7:46 UTC
To: Jens Axboe, io-uring, Keith Busch; +Cc: linux-block, linux-nvme, Ming Lei
Add a third parameter 'const struct io_comp_batch *' to the rq_end_io_fn
callback signature. This allows end_io handlers to access the completion
batch context when requests are completed via blk_mq_end_request_batch().
The io_comp_batch is passed through from blk_mq_end_request_batch(), while
NULL is passed from __blk_mq_end_request() and blk_mq_put_rq_ref(), which
have no batch context.
This infrastructure change enables drivers to detect whether they're
being called from a batched completion path (like iopoll) and access
additional context stored in the io_comp_batch.
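As an illustration only (not part of this patch; foo_end_io is a made-up
name), a handler under the new signature could use the extra argument like
this:

  static enum rq_end_io_ret foo_end_io(struct request *rq, blk_status_t error,
                                       const struct io_comp_batch *iob)
  {
          if (iob) {
                  /* batched completion, e.g. blk_mq_end_request_batch() (iopoll) */
          } else {
                  /* __blk_mq_end_request() / blk_mq_put_rq_ref(): no batch context */
          }
          return RQ_END_IO_NONE;
  }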
Update all rq_end_io_fn implementations:
- block/blk-mq.c: blk_end_sync_rq
- block/blk-flush.c: flush_end_io, mq_flush_data_end_io
- drivers/nvme/host/ioctl.c: nvme_uring_cmd_end_io
- drivers/nvme/host/core.c: nvme_keep_alive_end_io
- drivers/nvme/host/pci.c: abort_endio, nvme_del_queue_end, nvme_del_cq_end
- drivers/nvme/target/passthru.c: nvmet_passthru_req_done
- drivers/scsi/scsi_error.c: eh_lock_door_done
- drivers/scsi/sg.c: sg_rq_end_io
- drivers/scsi/st.c: st_scsi_execute_end
- drivers/target/target_core_pscsi.c: pscsi_req_done
- drivers/md/dm-rq.c: end_clone_request
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-flush.c | 6 ++++--
block/blk-mq.c | 9 +++++----
drivers/md/dm-rq.c | 3 ++-
drivers/nvme/host/core.c | 3 ++-
drivers/nvme/host/ioctl.c | 3 ++-
drivers/nvme/host/pci.c | 11 +++++++----
drivers/nvme/target/passthru.c | 3 ++-
drivers/scsi/scsi_error.c | 3 ++-
drivers/scsi/sg.c | 6 ++++--
drivers/scsi/st.c | 3 ++-
drivers/target/target_core_pscsi.c | 6 ++++--
include/linux/blk-mq.h | 4 +++-
12 files changed, 39 insertions(+), 21 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43d6152897a4..403a46c86411 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -199,7 +199,8 @@ static void blk_flush_complete_seq(struct request *rq,
}
static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
- blk_status_t error)
+ blk_status_t error,
+ const struct io_comp_batch *iob)
{
struct request_queue *q = flush_rq->q;
struct list_head *running;
@@ -335,7 +336,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
}
static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
- blk_status_t error)
+ blk_status_t error,
+ const struct io_comp_batch *iob)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a29d8ac9d3e3..cf1daedbb39f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1156,7 +1156,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
if (rq->end_io) {
rq_qos_done(rq->q, rq);
- if (rq->end_io(rq, error) == RQ_END_IO_FREE)
+ if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE)
blk_mq_free_request(rq);
} else {
blk_mq_free_request(rq);
@@ -1211,7 +1211,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
* If end_io handler returns NONE, then it still has
* ownership of the request.
*/
- if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
+ if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE)
continue;
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -1458,7 +1458,8 @@ struct blk_rq_wait {
blk_status_t ret;
};
-static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
+static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret,
+ const struct io_comp_batch *iob)
{
struct blk_rq_wait *wait = rq->end_io_data;
@@ -1688,7 +1689,7 @@ static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expi
void blk_mq_put_rq_ref(struct request *rq)
{
if (is_flush_rq(rq)) {
- if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
+ if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE)
blk_mq_free_request(rq);
} else if (req_ref_put_and_test(rq)) {
__blk_mq_free_request(rq);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index a6ca92049c10..e9a7563b4b2f 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -295,7 +295,8 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
}
static enum rq_end_io_ret end_clone_request(struct request *clone,
- blk_status_t error)
+ blk_status_t error,
+ const struct io_comp_batch *iob)
{
struct dm_rq_target_io *tio = clone->end_io_data;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7bf228df6001..19b67cf5d550 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1333,7 +1333,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
}
static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
- blk_status_t status)
+ blk_status_t status,
+ const struct io_comp_batch *iob)
{
struct nvme_ctrl *ctrl = rq->end_io_data;
unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6..e45ac0ca174e 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -410,7 +410,8 @@ static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw)
}
static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
- blk_status_t err)
+ blk_status_t err,
+ const struct io_comp_batch *iob)
{
struct io_uring_cmd *ioucmd = req->end_io_data;
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b78c55a8f38c..08c8a941f49e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1618,7 +1618,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}
-static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
+static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error,
+ const struct io_comp_batch *iob)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
@@ -2861,7 +2862,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
}
static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
- blk_status_t error)
+ blk_status_t error,
+ const struct io_comp_batch *iob)
{
struct nvme_queue *nvmeq = req->end_io_data;
@@ -2871,14 +2873,15 @@ static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
}
static enum rq_end_io_ret nvme_del_cq_end(struct request *req,
- blk_status_t error)
+ blk_status_t error,
+ const struct io_comp_batch *iob)
{
struct nvme_queue *nvmeq = req->end_io_data;
if (error)
set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
- return nvme_del_queue_end(req, error);
+ return nvme_del_queue_end(req, error, iob);
}
static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 67c423a8b052..5d541c2a46a5 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -247,7 +247,8 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
}
static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq,
- blk_status_t blk_status)
+ blk_status_t blk_status,
+ const struct io_comp_batch *iob)
{
struct nvmet_req *req = rq->end_io_data;
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index f869108fd969..1e93390c5a82 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2085,7 +2085,8 @@ enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd)
}
static enum rq_end_io_ret eh_lock_door_done(struct request *req,
- blk_status_t status)
+ blk_status_t status,
+ const struct io_comp_batch *iob)
{
blk_mq_free_request(req);
return RQ_END_IO_NONE;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 57fba34832ad..1a521f9d821a 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -177,7 +177,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
} Sg_device;
/* tasklet or soft irq callback */
-static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status);
+static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status,
+ const struct io_comp_batch *iob);
static int sg_start_req(Sg_request *srp, unsigned char *cmd);
static int sg_finish_rem_req(Sg_request * srp);
static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size);
@@ -1309,7 +1310,8 @@ sg_rq_end_io_usercontext(struct work_struct *work)
* level when a command is completed (or has failed).
*/
static enum rq_end_io_ret
-sg_rq_end_io(struct request *rq, blk_status_t status)
+sg_rq_end_io(struct request *rq, blk_status_t status,
+ const struct io_comp_batch *iob)
{
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
struct sg_request *srp = rq->end_io_data;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 168f25e4aaa3..8aeaa3b68c25 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -525,7 +525,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
}
static enum rq_end_io_ret st_scsi_execute_end(struct request *req,
- blk_status_t status)
+ blk_status_t status,
+ const struct io_comp_batch *iob)
{
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
struct st_request *SRpnt = req->end_io_data;
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index db4e09042469..823b2665f95b 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -39,7 +39,8 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev)
}
static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd);
-static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t);
+static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t,
+ const struct io_comp_batch *);
/* pscsi_attach_hba():
*
@@ -1001,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev)
}
static enum rq_end_io_ret pscsi_req_done(struct request *req,
- blk_status_t status)
+ blk_status_t status,
+ const struct io_comp_batch *iob)
{
struct se_cmd *cmd = req->end_io_data;
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index cae9e857aea4..18a2388ba581 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -13,6 +13,7 @@
struct blk_mq_tags;
struct blk_flush_queue;
+struct io_comp_batch;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_DEFAULT_RQ 128
@@ -22,7 +23,8 @@ enum rq_end_io_ret {
RQ_END_IO_FREE,
};
-typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
+typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t,
+ const struct io_comp_batch *);
/*
* request flags */
--
2.47.0
* [PATCH V2 2/2] nvme/io_uring: optimize IOPOLL completions for local ring context
From: Ming Lei @ 2026-01-16 7:46 UTC
To: Jens Axboe, io-uring, Keith Busch; +Cc: linux-block, linux-nvme, Ming Lei
When multiple io_uring rings poll on the same NVMe queue, one ring can
find completions belonging to another ring. The current code always punts
such completions to task_work, which adds overhead for the common
single-ring case.
Pass the polling io_ring_ctx through io_comp_batch's new poll_ctx field:
io_do_iopoll() stores the polling ring's context in iob.poll_ctx before
invoking the iopoll callbacks.
In nvme_uring_cmd_end_io(), compare iob->poll_ctx against the request's
owning io_ring_ctx (obtained via io_uring_cmd_ctx_handle()). If they
match (local context), complete inline with io_uring_cmd_done32(). If
they differ (remote context), or iob is NULL (non-iopoll path), fall back
to task_work as before.
This optimization eliminates task_work scheduling overhead for the
common case where a ring polls and finds its own completions.
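The core of the check in nvme_uring_cmd_end_io(), simplified from the diff
below:

  if (blk_rq_is_poll(req) && iob &&
      iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
          /* local ring context: complete inline */
          if (pdu->bio)
                  blk_rq_unmap_user(pdu->bio);
          io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
  } else {
          /* remote ring or non-iopoll path: punt to task_work */
          io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
  }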
~10% IOPS improvement is observed in the following benchmark:
fio/t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -O0 -P1 -u1 -n1 /dev/ng0n1
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/nvme/host/ioctl.c | 20 +++++++++++++-------
include/linux/blkdev.h | 1 +
io_uring/rw.c | 6 ++++++
3 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index e45ac0ca174e..fb62633ccbb0 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -426,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
/*
- * IOPOLL could potentially complete this request directly, but
- * if multiple rings are polling on the same queue, then it's possible
- * for one ring to find completions for another ring. Punting the
- * completion via task_work will always direct it to the right
- * location, rather than potentially complete requests for ringA
- * under iopoll invocations from ringB.
+ * For IOPOLL, check if this completion is happening in the context
+ * of the same io_ring that owns the request (local context). If so,
+ * we can complete inline without task_work overhead. Otherwise, we
+ * must punt to task_work to ensure completion happens in the correct
+ * ring's context.
*/
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+ if (blk_rq_is_poll(req) && iob &&
+ iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
+ if (pdu->bio)
+ blk_rq_unmap_user(pdu->bio);
+ io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
+ } else {
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+ }
return RQ_END_IO_FREE;
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 438c4946b6e5..251e0f538c4c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1822,6 +1822,7 @@ struct io_comp_batch {
struct rq_list req_list;
bool need_ts;
void (*complete)(struct io_comp_batch *);
+ void *poll_ctx;
};
static inline bool blk_atomic_write_start_sect_aligned(sector_t sector,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index c33c533a267e..4c81a5a89089 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1321,6 +1321,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
struct io_kiocb *req, *tmp;
int nr_events = 0;
+ /*
+ * Store the polling io_ring_ctx so drivers can detect if they're
+ * completing a request in the same ring context that's polling.
+ */
+ iob.poll_ctx = ctx;
+
/*
* Only spin for completions if we don't have multiple devices hanging
* off our complete list.
--
2.47.0