* [PATCH v2 1/4] io_uring: add REQ_F_IOPOLL
2026-02-19 1:43 [PATCH v2 0/4] io_uring/uring_cmd: allow non-iopoll cmds with IORING_SETUP_IOPOLL Caleb Sander Mateos
@ 2026-02-19 1:43 ` Caleb Sander Mateos
2026-02-19 1:43 ` [PATCH v2 2/4] io_uring: remove iopoll_queue from struct io_issue_def Caleb Sander Mateos
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Caleb Sander Mateos @ 2026-02-19 1:43 UTC
To: Jens Axboe, Christoph Hellwig, Keith Busch, Sagi Grimberg
Cc: io-uring, linux-nvme, linux-kernel, Caleb Sander Mateos
A subsequent commit will allow uring_cmds whose file doesn't implement
->uring_cmd_iopoll() to be issued to IORING_SETUP_IOPOLL
io_urings. This means the ctx's IORING_SETUP_IOPOLL flag isn't
sufficient to determine whether a given request needs to be iopolled.
Introduce a request flag REQ_F_IOPOLL, set in ->issue() if a request
needs to be iopolled to completion. Set the flag in io_rw_init_file()
and io_uring_cmd() for requests issued to IORING_SETUP_IOPOLL ctxs. Use
the request flag instead of IORING_SETUP_IOPOLL in places dealing with a
specific request.
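For context, a minimal liburing sketch of the kind of I/O an
IORING_SETUP_IOPOLL ring is built for is below (the device path, queue
depth, and buffer size are arbitrary, and it assumes a block device
whose driver implements ->iopoll()):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <liburing.h>

int main(int argc, char **argv)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *buf;
	int fd, ret;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block device>\n", argv[0]);
		return 1;
	}
	/* iopoll requires O_DIRECT I/O on a file implementing ->iopoll() */
	fd = open(argv[1], O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, 4096, 0);
	io_uring_submit(&ring);
	/* on an IOPOLL ring, waiting busy-polls for the completion */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		fprintf(stderr, "wait_cqe: %d\n", ret);
		return 1;
	}
	printf("read: %d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}

Every read or write submitted to such a ring must support iopoll;
REQ_F_IOPOLL records that requirement per request rather than inferring
it from the ctx flag.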
A future possibility would be to add an option to enable/disable iopoll
in the io_uring SQE instead of determining it from IORING_SETUP_IOPOLL.
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
include/linux/io_uring_types.h | 3 +++
io_uring/io_uring.c | 9 ++++-----
io_uring/rw.c | 11 ++++++-----
io_uring/uring_cmd.c | 5 +++--
4 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3e4a82a6f817..4563e1fafdf0 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -541,10 +541,11 @@ enum {
REQ_F_BUFFERS_COMMIT_BIT,
REQ_F_BUF_NODE_BIT,
REQ_F_HAS_METADATA_BIT,
REQ_F_IMPORT_BUFFER_BIT,
REQ_F_SQE_COPIED_BIT,
+ REQ_F_IOPOLL_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
};
@@ -632,10 +633,12 @@ enum {
* For SEND_ZC, whether to import buffers (i.e. the first issue).
*/
REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
/* ->sqe_copy() has been called, if necessary */
REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
+ /* request must be iopolled to completion (set in ->issue()) */
+ REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT),
};
struct io_tw_req {
struct io_kiocb *req;
};
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ccab8562d273..43059f6e10e0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -354,11 +354,10 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
}
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
- struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CREDS)) {
req->flags |= REQ_F_CREDS;
req->creds = get_current_cred();
}
@@ -376,11 +375,11 @@ static void io_prep_async_work(struct io_kiocb *req)
/* don't serialize this request if the fs doesn't need it */
if (should_hash && (req->file->f_flags & O_DIRECT) &&
(req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
should_hash = false;
- if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
+ if (should_hash || (req->flags & REQ_F_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
if (def->unbound_nonreg_file)
atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags);
}
@@ -1417,11 +1416,11 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (ret == IOU_ISSUE_SKIP_COMPLETE) {
ret = 0;
/* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
+ if ((req->flags & REQ_F_IOPOLL) && def->iopoll_queue)
io_iopoll_req_issued(req, issue_flags);
}
return ret;
}
@@ -1433,11 +1432,11 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)
int ret;
io_tw_lock(req->ctx, tw);
WARN_ON_ONCE(!req->file);
- if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL))
return -EFAULT;
ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE);
@@ -1531,11 +1530,11 @@ void io_wq_submit_work(struct io_wq_work *work)
* We can get EAGAIN for iopolled IO even though we're
* forcing a sync submission from here, since we can't
* wait for request slots on the block side.
*/
if (!needs_poll) {
- if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (!(req->flags & REQ_F_IOPOLL))
break;
if (io_wq_worker_stopped())
break;
cond_resched();
continue;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1a5f262734e8..3bdb9914e673 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -502,11 +502,11 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
- !(ctx->flags & IORING_SETUP_IOPOLL)))
+ !(req->flags & REQ_F_IOPOLL)))
return false;
/*
* If ref is dying, we might be running poll reap from the exit work.
* Don't attempt to reissue from that path, just let it fail with
* -EAGAIN.
@@ -638,11 +638,11 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
ret = -EINTR;
break;
}
}
- if (req->ctx->flags & IORING_SETUP_IOPOLL)
+ if (req->flags & REQ_F_IOPOLL)
io_complete_rw_iopoll(&rw->kiocb, ret);
else
io_complete_rw(&rw->kiocb, ret);
}
@@ -652,11 +652,11 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned final_ret = io_fixup_rw_res(req, ret);
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
req->file->f_pos = rw->kiocb.ki_pos;
- if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
+ if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) {
u32 cflags = 0;
__io_complete_rw_common(req, ret);
/*
* Safe to call io_end from here as we're inline
@@ -874,10 +874,11 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
req->flags |= REQ_F_NOWAIT;
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
+ req->flags |= REQ_F_IOPOLL;
kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI;
req->iopoll_completed = 0;
if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
/* make sure every req only blocks once */
@@ -961,11 +962,11 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
if (ret == -EAGAIN) {
/* If we can poll, just do that. */
if (io_file_can_poll(req))
return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
- if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (!force_nonblock && !(req->flags & REQ_F_IOPOLL))
goto done;
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (req->flags & REQ_F_NOWAIT)
goto done;
ret = 0;
@@ -1186,11 +1187,11 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
- if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL))
goto ret_eagain;
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
req->cqe.res, ret2);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index ee7b49f47cb5..b651c63f6e20 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -108,11 +108,11 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
* Doing cancelations on IOPOLL requests are not supported. Both
* because they can't get canceled in the block stack, but also
* because iopoll completion data overlaps with the hash_node used
* for tracking.
*/
- if (ctx->flags & IORING_SETUP_IOPOLL)
+ if (req->flags & REQ_F_IOPOLL)
return;
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
cmd->flags |= IORING_URING_CMD_CANCELABLE;
io_ring_submit_lock(ctx, issue_flags);
@@ -165,11 +165,11 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
if (req->ctx->flags & IORING_SETUP_CQE_MIXED)
req->cqe.flags |= IORING_CQE_F_32;
io_req_set_cqe32_extra(req, res2, 0);
}
io_req_uring_cleanup(req, issue_flags);
- if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+ if (req->flags & REQ_F_IOPOLL) {
/* order with io_iopoll_req_issued() checking ->iopoll_complete */
smp_store_release(&req->iopoll_completed, 1);
} else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED))
return;
@@ -258,10 +258,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
if (io_is_compat(ctx))
issue_flags |= IO_URING_F_COMPAT;
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!file->f_op->uring_cmd_iopoll)
return -EOPNOTSUPP;
+ req->flags |= REQ_F_IOPOLL;
issue_flags |= IO_URING_F_IOPOLL;
req->iopoll_completed = 0;
if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
/* make sure every req only blocks once */
req->flags &= ~REQ_F_IOPOLL_STATE;
--
2.45.2
* [PATCH v2 2/4] io_uring: remove iopoll_queue from struct io_issue_def
2026-02-19 1:43 [PATCH v2 0/4] io_uring/uring_cmd: allow non-iopoll cmds with IORING_SETUP_IOPOLL Caleb Sander Mateos
2026-02-19 1:43 ` [PATCH v2 1/4] io_uring: add REQ_F_IOPOLL Caleb Sander Mateos
@ 2026-02-19 1:43 ` Caleb Sander Mateos
2026-02-19 1:43 ` [PATCH v2 3/4] io_uring/uring_cmd: allow non-iopoll cmds with IORING_SETUP_IOPOLL Caleb Sander Mateos
2026-02-19 1:43 ` [PATCH v2 4/4] nvme: remove nvme_dev_uring_cmd() IO_URING_F_IOPOLL check Caleb Sander Mateos
3 siblings, 0 replies; 5+ messages in thread
From: Caleb Sander Mateos @ 2026-02-19 1:43 UTC
To: Jens Axboe, Christoph Hellwig, Keith Busch, Sagi Grimberg
Cc: io-uring, linux-nvme, linux-kernel, Caleb Sander Mateos
The opcode iopoll_queue flag is now redundant with REQ_F_IOPOLL. Only
io_{read,write}{,_fixed}() and io_uring_cmd() set the REQ_F_IOPOLL flag,
and the opcodes with these ->issue() implementations are precisely the
ones that set iopoll_queue. So don't bother checking the iopoll_queue
flag in io_issue_sqe(). Remove the unused flag from struct io_issue_def.
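The equivalence can be sanity-checked with a standalone sketch (the
table below is an abbreviated stand-in for io_issue_defs[], not the
kernel's actual definitions):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* abbreviated stand-in for struct io_issue_def */
struct def {
	const char *name;
	bool iopoll_queue;
	const char *issue;
};

static const struct def defs[] = {
	{ "READV",       true,  "io_read" },
	{ "WRITEV",      true,  "io_write" },
	{ "READ_FIXED",  true,  "io_read_fixed" },
	{ "WRITE_FIXED", true,  "io_write_fixed" },
	{ "URING_CMD",   true,  "io_uring_cmd" },
	{ "POLL_ADD",    false, "io_poll_add" },
	{ "NOP",         false, "io_nop" },
};

/* the ->issue() implementations that set REQ_F_IOPOLL on IOPOLL ctxs */
static bool sets_req_f_iopoll(const char *issue)
{
	static const char *const setters[] = {
		"io_read", "io_write", "io_read_fixed",
		"io_write_fixed", "io_uring_cmd",
	};
	size_t i;

	for (i = 0; i < sizeof(setters) / sizeof(setters[0]); i++)
		if (!strcmp(issue, setters[i]))
			return true;
	return false;
}

int main(void)
{
	size_t i;

	/* iopoll_queue is set exactly where ->issue() sets REQ_F_IOPOLL */
	for (i = 0; i < sizeof(defs) / sizeof(defs[0]); i++)
		assert(defs[i].iopoll_queue == sets_req_f_iopoll(defs[i].issue));
	return 0;
}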
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
io_uring/io_uring.c | 3 +--
io_uring/opdef.c | 10 ----------
io_uring/opdef.h | 2 --
3 files changed, 1 insertion(+), 14 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 43059f6e10e0..2be46e11e1a7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1415,12 +1415,11 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
}
if (ret == IOU_ISSUE_SKIP_COMPLETE) {
ret = 0;
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->flags & REQ_F_IOPOLL) && def->iopoll_queue)
+ if (req->flags & REQ_F_IOPOLL)
io_iopoll_req_issued(req, issue_flags);
}
return ret;
}
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 91a23baf415e..4b3fb19b0cde 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -65,11 +65,10 @@ const struct io_issue_def io_issue_defs[] = {
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_readv,
.issue = io_read,
},
@@ -80,11 +79,10 @@ const struct io_issue_def io_issue_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_writev,
.issue = io_write,
},
@@ -100,11 +98,10 @@ const struct io_issue_def io_issue_defs[] = {
.pollin = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_read_fixed,
.issue = io_read_fixed,
},
[IORING_OP_WRITE_FIXED] = {
@@ -114,11 +111,10 @@ const struct io_issue_def io_issue_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_write_fixed,
.issue = io_write_fixed,
},
[IORING_OP_POLL_ADD] = {
@@ -248,11 +244,10 @@ const struct io_issue_def io_issue_defs[] = {
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_read,
.issue = io_read,
},
[IORING_OP_WRITE] = {
@@ -262,11 +257,10 @@ const struct io_issue_def io_issue_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_write,
.issue = io_write,
},
[IORING_OP_FADVISE] = {
@@ -421,11 +415,10 @@ const struct io_issue_def io_issue_defs[] = {
[IORING_OP_URING_CMD] = {
.buffer_select = 1,
.needs_file = 1,
.plug = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.async_size = sizeof(struct io_async_cmd),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
},
[IORING_OP_SEND_ZC] = {
@@ -554,11 +547,10 @@ const struct io_issue_def io_issue_defs[] = {
.pollin = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_readv_fixed,
.issue = io_read,
},
@@ -569,11 +561,10 @@ const struct io_issue_def io_issue_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_writev_fixed,
.issue = io_write,
},
@@ -591,11 +582,10 @@ const struct io_issue_def io_issue_defs[] = {
[IORING_OP_URING_CMD128] = {
.buffer_select = 1,
.needs_file = 1,
.plug = 1,
.iopoll = 1,
- .iopoll_queue = 1,
.is_128 = 1,
.async_size = sizeof(struct io_async_cmd),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
},
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index faf3955dce8b..667f981e63b0 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -23,12 +23,10 @@ struct io_issue_def {
unsigned pollin : 1;
unsigned pollout : 1;
unsigned poll_exclusive : 1;
/* skip auditing */
unsigned audit_skip : 1;
- /* have to be put into the iopoll list */
- unsigned iopoll_queue : 1;
/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
unsigned vectored : 1;
/* set to 1 if this opcode uses 128b sqes in a mixed sq */
unsigned is_128 : 1;
--
2.45.2