From: Jens Axboe <axboe@kernel.dk>
To: Ming Lei <ming.lei@redhat.com>,
io-uring@vger.kernel.org, Pavel Begunkov <asml.silence@gmail.com>
Cc: Caleb Sander Mateos <csander@purestorage.com>
Subject: Re: [PATCH V4] io_uring: uring_cmd: add multishot support
Date: Wed, 20 Aug 2025 20:24:25 -0600 [thread overview]
Message-ID: <2a3e4ad9-9679-43cf-b368-237167c2f544@kernel.dk> (raw)
In-Reply-To: <087e7eb4-c3bb-4953-be9f-d936377cf0d4@kernel.dk>
On 8/20/25 1:17 PM, Jens Axboe wrote:
> On 8/20/25 9:40 AM, Ming Lei wrote:
>> Add UAPI flag IORING_URING_CMD_MULTISHOT for supporting multishot
>> uring_cmd operations with provided buffer.
>>
>> This enables drivers to post multiple completion events from a single
>> uring_cmd submission, which is useful for:
>>
>> - Notifying userspace of device events (e.g., interrupt handling)
>> - Supporting devices with multiple event sources (e.g., multi-queue devices)
>> - Avoiding the need for device poll() support when events originate
>> from multiple sources device-wide
>>
>> The implementation adds two new APIs:
>> - io_uring_cmd_select_buffer(): selects a buffer from the provided
>> buffer group for multishot uring_cmd
>> - io_uring_mshot_cmd_post_cqe(): posts a CQE after event data is
>> pushed to the provided buffer
>>
>> Multishot uring_cmd must be used with buffer select (IOSQE_BUFFER_SELECT)
>> and is mutually exclusive with IORING_URING_CMD_FIXED for now.
>>
>> The ublk driver will be the first user of this functionality:
>>
>> https://github.com/ming1/linux/commits/ublk-devel/
>>
>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>> ---
>> V4:
>> - add io_do_buffer_select() check in io_uring_cmd_select_buffer(()
>> - comments that the two APIs should work together for committing buffer
>> upfront(Jens)
>>
>> V3:
>> - enhance buffer select check(Jens)
>>
>> V2:
>> - Fixed static inline return type
>> - Updated UAPI comments: Clarified that IORING_URING_CMD_MULTISHOT must be used with buffer select
>> - Refactored validation checks: Moved the mutual exclusion checks into the individual flag validation
>> sections for better code organization
>> - Added missing req_set_fail(): Added the missing failure handling in io_uring_mshot_cmd_post_cqe
>> - Improved commit message: Rewrote the commit message to be clearer, more technical, and better explain
>> the use cases and API changes
>>
>> include/linux/io_uring/cmd.h | 28 ++++++++++++
>> include/uapi/linux/io_uring.h | 6 ++-
>> io_uring/opdef.c | 1 +
>> io_uring/uring_cmd.c | 81 ++++++++++++++++++++++++++++++++++-
>> 4 files changed, 114 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
>> index cfa6d0c0c322..72832757f8ef 100644
>> --- a/include/linux/io_uring/cmd.h
>> +++ b/include/linux/io_uring/cmd.h
>> @@ -70,6 +70,22 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
>> /* Execute the request from a blocking context */
>> void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
>>
>> +/*
>> + * Select a buffer from the provided buffer group for multishot uring_cmd.
>> + * Returns the selected buffer address and size.
>> + */
>> +int io_uring_cmd_select_buffer(struct io_uring_cmd *ioucmd,
>> + unsigned buf_group,
>> + void **buf, size_t *len,
>> + unsigned int issue_flags);
>> +
>> +/*
>> + * Complete a multishot uring_cmd event. This will post a CQE to the completion
>> + * queue and update the provided buffer.
>> + */
>> +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
>> + ssize_t ret, unsigned int issue_flags);
>> +
>> #else
>> static inline int
>> io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
>> @@ -102,6 +118,18 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
>> static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
>> {
>> }
>> +static inline int io_uring_cmd_select_buffer(struct io_uring_cmd *ioucmd,
>> + unsigned buf_group,
>> + void **buf, size_t *len,
>> + unsigned int issue_flags)
>> +{
>> + return -EOPNOTSUPP;
>> +}
>> +static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
>> + ssize_t ret, unsigned int issue_flags)
>> +{
>> + return true;
>> +}
>> #endif
>>
>> /*
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 6957dc539d83..1e935f8901c5 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -298,9 +298,13 @@ enum io_uring_op {
>> * sqe->uring_cmd_flags top 8bits aren't available for userspace
>> * IORING_URING_CMD_FIXED use registered buffer; pass this flag
>> * along with setting sqe->buf_index.
>> + * IORING_URING_CMD_MULTISHOT must be used with buffer select, like other
>> + * multishot commands. Not compatible with
>> + * IORING_URING_CMD_FIXED, for now.
>> */
>> #define IORING_URING_CMD_FIXED (1U << 0)
>> -#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED
>> +#define IORING_URING_CMD_MULTISHOT (1U << 1)
>> +#define IORING_URING_CMD_MASK (IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT)
>>
>>
>> /*
>> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
>> index 9568785810d9..932319633eac 100644
>> --- a/io_uring/opdef.c
>> +++ b/io_uring/opdef.c
>> @@ -413,6 +413,7 @@ const struct io_issue_def io_issue_defs[] = {
>> #endif
>> },
>> [IORING_OP_URING_CMD] = {
>> + .buffer_select = 1,
>> .needs_file = 1,
>> .plug = 1,
>> .iopoll = 1,
>> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
>> index 053bac89b6c0..babb6a4b3542 100644
>> --- a/io_uring/uring_cmd.c
>> +++ b/io_uring/uring_cmd.c
>> @@ -11,6 +11,7 @@
>> #include "io_uring.h"
>> #include "alloc_cache.h"
>> #include "rsrc.h"
>> +#include "kbuf.h"
>> #include "uring_cmd.h"
>> #include "poll.h"
>>
>> @@ -194,8 +195,21 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> if (ioucmd->flags & ~IORING_URING_CMD_MASK)
>> return -EINVAL;
>>
>> - if (ioucmd->flags & IORING_URING_CMD_FIXED)
>> + if (ioucmd->flags & IORING_URING_CMD_FIXED) {
>> + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT)
>> + return -EINVAL;
>> req->buf_index = READ_ONCE(sqe->buf_index);
>> + }
>> +
>> + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) {
>> + if (ioucmd->flags & IORING_URING_CMD_FIXED)
>> + return -EINVAL;
>> + if (!(req->flags & REQ_F_BUFFER_SELECT))
>> + return -EINVAL;
>> + } else {
>> + if (req->flags & REQ_F_BUFFER_SELECT)
>> + return -EINVAL;
>> + }
>>
>> ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
>>
>> @@ -251,6 +265,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
>> }
>>
>> ret = file->f_op->uring_cmd(ioucmd, issue_flags);
>> + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) {
>> + if (ret >= 0)
>> + return IOU_ISSUE_SKIP_COMPLETE;
>> + io_kbuf_recycle(req, issue_flags);
>> + }
>> if (ret == -EAGAIN) {
>> ioucmd->flags |= IORING_URING_CMD_REISSUE;
>> return ret;
>> @@ -333,3 +352,63 @@ bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd,
>> return false;
>> return io_req_post_cqe32(req, cqe);
>> }
>> +
>> +/*
>> + * Work with io_uring_mshot_cmd_post_cqe() together for committing the
>> + * provided buffer upfront
>> + *
>> + * The two must be paired, and both must be called within the same
>> + * uring_cmd submission context.
>> + */
>> +int io_uring_cmd_select_buffer(struct io_uring_cmd *ioucmd,
>> + unsigned buf_group,
>> + void __user **buf, size_t *len,
>> + unsigned int issue_flags)
>> +{
>> + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
>> + void __user *ubuf;
>> +
>> + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT))
>> + return -EINVAL;
>> +
>> + if (WARN_ON_ONCE(!io_do_buffer_select(req)))
>> + return -EINVAL;
>> +
>> + ubuf = io_buffer_select(req, len, buf_group, issue_flags);
>> + if (!ubuf)
>> + return -ENOBUFS;
>> +
>> + *buf = ubuf;
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(io_uring_cmd_select_buffer);
>> +
>> +/*
>> + * Return true if this multishot uring_cmd needs to be completed, otherwise
>> + * the event CQE is posted successfully.
>> + *
>> + * This function must be paired with io_uring_cmd_select_buffer(), and both
>> + * must be called within the same uring_cmd submission context.
>> + */
>> +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
>> + ssize_t ret, unsigned int issue_flags)
>> +{
>> + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
>> + unsigned int cflags = 0;
>> +
>> + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT))
>> + return true;
>> +
>> + if (ret > 0) {
>> + cflags = io_put_kbuf(req, ret, issue_flags);
>> + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE))
>> + return false;
>> + }
>> +
>> + io_kbuf_recycle(req, issue_flags);
>> + if (ret < 0)
>> + req_set_fail(req);
>> + io_req_set_res(req, ret, cflags);
>> + return true;
>> +}
>> +EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe);
>
> I posted the series I talked about, which wraps this in struct
> io_br_sel:
>
> https://lore.kernel.org/io-uring/20250820182601.442933-1-axboe@kernel.dk/T/#m1001f151693606f7a73b48ab939be5caf8d182c3
>
> and with that in mind, I don't think the CQE posting is the place to do
> it. Even before that series, I don't think it's the right spot. I'd keep
> that as just posting the CQE.
>
> In any case, on top of the current patches, you'd do something ala:
>
> struct io_br_sel sel;
>
> sel = io_uring_cmd_select_buffer(cmd, group, &len, issue_flags);
>
> and sel.addr would be your buffer on return, and sel would hold the
> buffer_list (or have it committed already and NULL) after the fact.
> sel.
>
> Then once you're ready to do so, call:
>
> cflags = io_put_kbuf(req, ret, sel.buf_list, issue_flags);
> io_uring_mshot_cmd_post_cqe(cmd, ret, cflags);
>
> in that same context. As mentioned, I'd keep that outside of the CQE
> posting, leaving that as:
>
> bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
> ssize_t ret, unsigned int cflags)
> {
> struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
>
> if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT))
> return true;
>
> if (ret > 0) {
> if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE))
> return false;
> }
>
> if (ret < 0)
> req_set_fail(req);
> io_req_set_res(req, ret, cflags);
> return true;
> }
>
> With that, it should be completely clear that the usage of
> io_buffer_list is always sane, and it forces you to do the right thing
> wrt how the buffer is retrieved and committed. This avoids any issues
> with the current patch in terms of buffer list UAF, and makes it so that
> future users of this helper is also forced to get it right rather than
> need to understand all the minute details around that.
>
> Hope that makes sense...
Something like this incremental on top of the current tree, where
obviously the kbuf.h io_br_sel move to io_uring_types.h is a separate
patch.
That:
- Gets rid of the recycle in the issue path. You can't recycle there,
have the caller that selected the buffer do the recycling there, if
needed.
- Naming io_uring_cmd_buffer_select() rather than
io_uring_cmd_select_buffer() to closer follow the internal API.
- io_uring_cmd_buffer_select() will now return with a NULL ->addr,
caller should just -ENOBUFS at that point. More of a stylistic
choice...
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 72832757f8ef..856d343b8e2a 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -74,17 +74,16 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
* Select a buffer from the provided buffer group for multishot uring_cmd.
* Returns the selected buffer address and size.
*/
-int io_uring_cmd_select_buffer(struct io_uring_cmd *ioucmd,
- unsigned buf_group,
- void **buf, size_t *len,
- unsigned int issue_flags);
+struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
+ unsigned buf_group, size_t *len,
+ unsigned int issue_flags);
/*
* Complete a multishot uring_cmd event. This will post a CQE to the completion
* queue and update the provided buffer.
*/
bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
- ssize_t ret, unsigned int issue_flags);
+ struct io_br_sel *sel, unsigned int issue_flags);
#else
static inline int
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1d33984611bc..472a1e1383a4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -85,6 +85,24 @@ struct io_mapped_region {
unsigned flags;
};
+/*
+ * Return value from io_buffer_list selection, to avoid stashing it in
+ * struct io_kiocb. For legacy/classic provided buffers, keeping a reference
+ * across execution contexts are fine. But for ring provided buffers, the
+ * list may go away as soon as ->uring_lock is dropped. As the io_kiocb
+ * persists, it's better to just keep the buffer local for those cases.
+ */
+struct io_br_sel {
+ struct io_buffer_list *buf_list;
+ /*
+ * Some selection parts return the user address, others return an error.
+ */
+ union {
+ void __user *addr;
+ ssize_t val;
+ };
+};
+
/*
* Arbitrary limit, can be raised if need be
*/
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 32f73adbe1e9..ada382ff38d7 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -62,24 +62,6 @@ struct buf_sel_arg {
unsigned short partial_map;
};
-/*
- * Return value from io_buffer_list selection, to avoid stashing it in
- * struct io_kiocb. For legacy/classic provided buffers, keeping a reference
- * across execution contexts are fine. But for ring provided buffers, the
- * list may go away as soon as ->uring_lock is dropped. As the io_kiocb
- * persists, it's better to just keep the buffer local for those cases.
- */
-struct io_br_sel {
- struct io_buffer_list *buf_list;
- /*
- * Some selection parts return the user address, others return an error.
- */
- union {
- void __user *addr;
- ssize_t val;
- };
-};
-
struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned buf_group, unsigned int issue_flags);
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index babb6a4b3542..688fd1fad2a8 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -268,7 +268,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) {
if (ret >= 0)
return IOU_ISSUE_SKIP_COMPLETE;
- io_kbuf_recycle(req, issue_flags);
}
if (ret == -EAGAIN) {
ioucmd->flags |= IORING_URING_CMD_REISSUE;
@@ -360,28 +359,21 @@ bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd,
* The two must be paired, and both must be called within the same
* uring_cmd submission context.
*/
-int io_uring_cmd_select_buffer(struct io_uring_cmd *ioucmd,
- unsigned buf_group,
- void __user **buf, size_t *len,
- unsigned int issue_flags)
+struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
+ unsigned buf_group, size_t *len,
+ unsigned int issue_flags)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
- void __user *ubuf;
if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT))
- return -EINVAL;
+ return (struct io_br_sel) { .val = -EINVAL };
if (WARN_ON_ONCE(!io_do_buffer_select(req)))
- return -EINVAL;
-
- ubuf = io_buffer_select(req, len, buf_group, issue_flags);
- if (!ubuf)
- return -ENOBUFS;
+ return (struct io_br_sel) { .val = -EINVAL };
- *buf = ubuf;
- return 0;
+ return io_buffer_select(req, len, buf_group, issue_flags);
}
-EXPORT_SYMBOL_GPL(io_uring_cmd_select_buffer);
+EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select);
/*
* Return true if this multishot uring_cmd needs to be completed, otherwise
@@ -391,7 +383,7 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_select_buffer);
* must be called within the same uring_cmd submission context.
*/
bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
- ssize_t ret, unsigned int issue_flags)
+ struct io_br_sel *sel, unsigned int issue_flags)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
unsigned int cflags = 0;
@@ -399,16 +391,16 @@ bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT))
return true;
- if (ret > 0) {
- cflags = io_put_kbuf(req, ret, issue_flags);
- if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE))
+ if (sel->val > 0) {
+ cflags = io_put_kbuf(req, sel->val, sel->buf_list);
+ if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE))
return false;
}
- io_kbuf_recycle(req, issue_flags);
- if (ret < 0)
+ io_kbuf_recycle(req, sel->buf_list, issue_flags);
+ if (sel->val < 0)
req_set_fail(req);
- io_req_set_res(req, ret, cflags);
+ io_req_set_res(req, sel->val, cflags);
return true;
}
EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe);
--
Jens Axboe
prev parent reply other threads:[~2025-08-21 2:24 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-20 15:40 [PATCH V4] io_uring: uring_cmd: add multishot support Ming Lei
2025-08-20 19:17 ` Jens Axboe
2025-08-21 2:24 ` Jens Axboe [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=2a3e4ad9-9679-43cf-b368-237167c2f544@kernel.dk \
--to=axboe@kernel.dk \
--cc=asml.silence@gmail.com \
--cc=csander@purestorage.com \
--cc=io-uring@vger.kernel.org \
--cc=ming.lei@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox