From: Caleb Sander Mateos <csander@purestorage.com>
To: Keith Busch <kbusch@meta.com>
Cc: io-uring@vger.kernel.org, axboe@kernel.dk,
Keith Busch <kbusch@kernel.org>
Subject: Re: [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED
Date: Tue, 14 Oct 2025 15:33:19 -0700 [thread overview]
Message-ID: <CADUfDZqe7+M9dqxVxUmMo31S1EGVmOhwqfKGLJfR45Yb_BT+Fg@mail.gmail.com> (raw)
In-Reply-To: <20251013180011.134131-3-kbusch@meta.com>
On Mon, Oct 13, 2025 at 11:00 AM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> Normal rings support 64b SQEs for posting submissions, while certain
> features require the ring to be configured with IORING_SETUP_SQE128, as
> they need to convey more information per submission. This, in turn,
> makes ALL the SQEs be 128b in size. This is somewhat wasteful and
> inefficient, particularly when only certain SQEs need to be of the
> bigger variant.
>
> This adds support for setting up a ring with mixed SQE sizes, using
> IORING_SETUP_SQE_MIXED. When set up in this mode, SQEs posted to the ring
> may be either 64b or 128b in size. If a SQE is 128b in size, then opcode
> will be set to a variant to indicate that this is the case. Any other
> non-128b opcode will assume the SQ's default size.
>
> SQEs on these types of mixed rings may also utilize NOP with skip
> success set. This can happen if the ring is one (small) SQE entry away
> from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be
> contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this
> case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag
> set. The kernel will process this as a normal NOP and without posting a
> CQE.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> include/uapi/linux/io_uring.h | 8 ++++++++
> io_uring/fdinfo.c | 34 +++++++++++++++++++++++++++-------
> io_uring/io_uring.c | 35 +++++++++++++++++++++++++++++++----
> io_uring/io_uring.h | 14 ++------------
> io_uring/opdef.c | 26 ++++++++++++++++++++++++++
> io_uring/opdef.h | 2 ++
> io_uring/register.c | 2 +-
> io_uring/uring_cmd.c | 17 +++++++++++++++--
> 8 files changed, 112 insertions(+), 26 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 263bed13473ef..04797a9b76bc2 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit {
> */
> #define IORING_SETUP_CQE_MIXED (1U << 18)
>
> +/*
> + * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have
> + * a 128b opcode.
> + */
> +#define IORING_SETUP_SQE_MIXED (1U << 19)
> +
> enum io_uring_op {
> IORING_OP_NOP,
> IORING_OP_READV,
> @@ -295,6 +301,8 @@ enum io_uring_op {
> IORING_OP_READV_FIXED,
> IORING_OP_WRITEV_FIXED,
> IORING_OP_PIPE,
> + IORING_OP_NOP128,
> + IORING_OP_URING_CMD128,
>
> /* this goes last, obviously */
> IORING_OP_LAST,
> diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
> index ff3364531c77b..d14d2e983b623 100644
> --- a/io_uring/fdinfo.c
> +++ b/io_uring/fdinfo.c
> @@ -14,6 +14,7 @@
> #include "fdinfo.h"
> #include "cancel.h"
> #include "rsrc.h"
> +#include "opdef.h"
>
> #ifdef CONFIG_NET_RX_BUSY_POLL
> static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
> @@ -66,7 +67,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> unsigned int cq_head = READ_ONCE(r->cq.head);
> unsigned int cq_tail = READ_ONCE(r->cq.tail);
> unsigned int sq_shift = 0;
> - unsigned int sq_entries;
> int sq_pid = -1, sq_cpu = -1;
> u64 sq_total_time = 0, sq_work_time = 0;
> unsigned int i;
> @@ -89,26 +89,45 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> seq_printf(m, "CqTail:\t%u\n", cq_tail);
> seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
> seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
> - sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
> - for (i = 0; i < sq_entries; i++) {
> - unsigned int entry = i + sq_head;
> + while (sq_head < sq_tail) {
> struct io_uring_sqe *sqe;
> unsigned int sq_idx;
> + bool sqe128 = false;
> + u8 opcode;
>
> if (ctx->flags & IORING_SETUP_NO_SQARRAY)
> break;
> - sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
> + sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
> if (sq_idx > sq_mask)
> continue;
> +
> + opcode = READ_ONCE(sqe->opcode);
> sqe = &ctx->sq_sqes[sq_idx << sq_shift];
> + if (sq_shift)
> + sqe128 = true;
> + else if (io_issue_defs[opcode].is_128) {
> + if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) {
> + seq_printf(m,
> + "%5u: invalid sqe, 128B entry on non-mixed sq\n",
> + sq_idx);
> + break;
> + }
> + if ((++sq_head & sq_mask) == 0) {
> + seq_printf(m,
> + "%5u: corrupted sqe, wrapping 128B entry\n",
> + sq_idx);
> + break;
> + }
> + sqe128 = true;
> + }
> seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
> "addr:0x%llx, rw_flags:0x%x, buf_index:%d "
> "user_data:%llu",
> - sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
> + sq_idx, io_uring_get_opcode(opcode), sqe->fd,
> sqe->flags, (unsigned long long) sqe->off,
> (unsigned long long) sqe->addr, sqe->rw_flags,
> sqe->buf_index, sqe->user_data);
> - if (sq_shift) {
> + if (sqe128) {
> u64 *sqeb = (void *) (sqe + 1);
> int size = sizeof(struct io_uring_sqe) / sizeof(u64);
> int j;
> @@ -120,6 +139,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> }
> }
> seq_printf(m, "\n");
> + sq_head++;
> }
> seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
> while (cq_head < cq_tail) {
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 820ef05276667..cd84eb4f2d4ca 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2151,7 +2151,7 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err)
> }
>
> static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> - const struct io_uring_sqe *sqe)
> + const struct io_uring_sqe *sqe, unsigned int *left)
> __must_hold(&ctx->uring_lock)
> {
> const struct io_issue_def *def;
> @@ -2177,6 +2177,22 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> opcode = array_index_nospec(opcode, IORING_OP_LAST);
>
> def = &io_issue_defs[opcode];
> + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) {
> + /*
> + * A 128b op on a non-128b SQ requires mixed SQE support as
> + * well as 2 contiguous entries.
> + */
> + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
> + !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
> + return io_init_fail_req(req, -EINVAL);
> + /*
> + * A 128b operation on a mixed SQ uses two entries, so we have
> + * to increment the head and decrement what's left.
> + */
> + ctx->cached_sq_head++;
> + (*left)--;
Hmm, io_submit_sqes() calls io_get_task_refs() at the start to
decrement cached_refs by the number of SQEs (counting 128-byte SQEs
twice) but io_put_task() only increments it once for each completed
request (counting 128-byte SQEs once). Does that mean there's a
refcount leak? Perhaps io_submit_sqes() or this block needs to
increment cached_refs to account for each 128-byte SQE?
Otherwise, this looks good to me.
Best,
Caleb
> + }
> +
> if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
> /* enforce forwards compatibility on users */
> if (sqe_flags & ~SQE_VALID_FLAGS)
> @@ -2286,13 +2302,13 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
> }
>
> static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
> - const struct io_uring_sqe *sqe)
> + const struct io_uring_sqe *sqe, unsigned int *left)
> __must_hold(&ctx->uring_lock)
> {
> struct io_submit_link *link = &ctx->submit_state.link;
> int ret;
>
> - ret = io_init_req(ctx, req, sqe);
> + ret = io_init_req(ctx, req, sqe, left);
> if (unlikely(ret))
> return io_submit_fail_init(sqe, req, ret);
>
> @@ -2444,7 +2460,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
> * Continue submitting even for sqe failure if the
> * ring was setup with IORING_SETUP_SUBMIT_ALL
> */
> - if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
> + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) &&
> !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
> left--;
> break;
> @@ -2789,6 +2805,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
> if (cq_entries < 2)
> return SIZE_MAX;
> }
> + if (flags & IORING_SETUP_SQE_MIXED) {
> + if (sq_entries < 2)
> + return SIZE_MAX;
> + }
>
> #ifdef CONFIG_SMP
> off = ALIGN(off, SMP_CACHE_BYTES);
> @@ -3715,6 +3735,13 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
> if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
> (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
> return -EINVAL;
> + /*
> + * Nonsensical to ask for SQE128 and mixed SQE support, it's not
> + * supported to post 64b SQEs on a ring setup with SQE128.
> + */
> + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) ==
> + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED))
> + return -EINVAL;
>
> return 0;
> }
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index 46d9141d772a7..85ed8eb7df80c 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -54,7 +54,8 @@
> IORING_SETUP_REGISTERED_FD_ONLY |\
> IORING_SETUP_NO_SQARRAY |\
> IORING_SETUP_HYBRID_IOPOLL |\
> - IORING_SETUP_CQE_MIXED)
> + IORING_SETUP_CQE_MIXED |\
> + IORING_SETUP_SQE_MIXED)
>
> #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
> IORING_ENTER_SQ_WAKEUP |\
> @@ -578,17 +579,6 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
> io_req_task_work_add(req);
> }
>
> -/*
> - * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
> - * slot.
> - */
> -static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
> -{
> - if (ctx->flags & IORING_SETUP_SQE128)
> - return 2 * sizeof(struct io_uring_sqe);
> - return sizeof(struct io_uring_sqe);
> -}
> -
> static inline bool io_file_can_poll(struct io_kiocb *req)
> {
> if (req->flags & REQ_F_CAN_POLL)
> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index 932319633eac2..df52d760240e4 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -575,6 +575,24 @@ const struct io_issue_def io_issue_defs[] = {
> .prep = io_pipe_prep,
> .issue = io_pipe,
> },
> + [IORING_OP_NOP128] = {
> + .audit_skip = 1,
> + .iopoll = 1,
> + .is_128 = 1,
> + .prep = io_nop_prep,
> + .issue = io_nop,
> + },
> + [IORING_OP_URING_CMD128] = {
> + .buffer_select = 1,
> + .needs_file = 1,
> + .plug = 1,
> + .iopoll = 1,
> + .iopoll_queue = 1,
> + .is_128 = 1,
> + .async_size = sizeof(struct io_async_cmd),
> + .prep = io_uring_cmd_prep,
> + .issue = io_uring_cmd,
> + },
> };
>
> const struct io_cold_def io_cold_defs[] = {
> @@ -825,6 +843,14 @@ const struct io_cold_def io_cold_defs[] = {
> [IORING_OP_PIPE] = {
> .name = "PIPE",
> },
> + [IORING_OP_NOP128] = {
> + .name = "NOP128",
> + },
> + [IORING_OP_URING_CMD128] = {
> + .name = "URING_CMD128",
> + .sqe_copy = io_uring_cmd_sqe_copy,
> + .cleanup = io_uring_cmd_cleanup,
> + },
> };
>
> const char *io_uring_get_opcode(u8 opcode)
> diff --git a/io_uring/opdef.h b/io_uring/opdef.h
> index c2f0907ed78cc..aa37846880ffd 100644
> --- a/io_uring/opdef.h
> +++ b/io_uring/opdef.h
> @@ -27,6 +27,8 @@ struct io_issue_def {
> unsigned iopoll_queue : 1;
> /* vectored opcode, set if 1) vectored, and 2) handler needs to know */
> unsigned vectored : 1;
> + /* set to 1 if this opcode uses 128b sqes in a mixed sq */
> + unsigned is_128 : 1;
>
> /* size of async data needed, if any */
> unsigned short async_size;
> diff --git a/io_uring/register.c b/io_uring/register.c
> index 43f04c47522c0..e97d9cbba7111 100644
> --- a/io_uring/register.c
> +++ b/io_uring/register.c
> @@ -395,7 +395,7 @@ static void io_register_free_rings(struct io_ring_ctx *ctx,
> #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
> #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
> IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
> - IORING_SETUP_CQE_MIXED)
> + IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
>
> static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
> {
> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> index d1e3ba62ee8e8..a89b29cc5d199 100644
> --- a/io_uring/uring_cmd.c
> +++ b/io_uring/uring_cmd.c
> @@ -216,6 +216,18 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> return 0;
> }
>
> +/*
> + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
> + * slot.
> + */
> +static inline size_t uring_sqe_size(struct io_kiocb *req)
> +{
> + if (req->ctx->flags & IORING_SETUP_SQE128 ||
> + req->opcode == IORING_OP_URING_CMD128)
> + return 2 * sizeof(struct io_uring_sqe);
> + return sizeof(struct io_uring_sqe);
> +}
> +
> void io_uring_cmd_sqe_copy(struct io_kiocb *req)
> {
> struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> @@ -224,7 +236,7 @@ void io_uring_cmd_sqe_copy(struct io_kiocb *req)
> /* Should not happen, as REQ_F_SQE_COPIED covers this */
> if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
> return;
> - memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
> + memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req));
> ioucmd->sqe = ac->sqes;
> }
>
> @@ -242,7 +254,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
> if (ret)
> return ret;
>
> - if (ctx->flags & IORING_SETUP_SQE128)
> + if (ctx->flags & IORING_SETUP_SQE128 ||
> + req->opcode == IORING_OP_URING_CMD128)
> issue_flags |= IO_URING_F_SQE128;
> if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED))
> issue_flags |= IO_URING_F_CQE32;
> --
> 2.47.3
>
next prev parent reply other threads:[~2025-10-14 22:33 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-13 18:00 [PATCHv5 0/4] liburing: support for mix sized sqe's Keith Busch
2025-10-13 18:00 ` [PATCHv5 0/1] io_uring: mixed submission queue entries sizes Keith Busch
2025-10-13 18:00 ` [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED Keith Busch
2025-10-14 22:33 ` Caleb Sander Mateos [this message]
2025-10-15 2:03 ` Keith Busch
2025-10-16 18:06 ` Keith Busch
2025-10-13 18:00 ` [PATCHv5 1/4] liburing: provide uring_cmd prep function Keith Busch
2025-10-19 16:24 ` Caleb Sander Mateos
2025-10-21 16:45 ` Keith Busch
2025-10-13 18:00 ` [PATCHv5 2/4] Add support IORING_SETUP_SQE_MIXED Keith Busch
2025-10-13 18:00 ` [PATCHv5 3/4] Add nop testing for IORING_SETUP_SQE_MIXED Keith Busch
2025-10-13 18:00 ` [PATCHv5 4/4] Add mixed sqe test for uring commands Keith Busch
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CADUfDZqe7+M9dqxVxUmMo31S1EGVmOhwqfKGLJfR45Yb_BT+Fg@mail.gmail.com \
--to=csander@purestorage.com \
--cc=axboe@kernel.dk \
--cc=io-uring@vger.kernel.org \
--cc=kbusch@kernel.org \
--cc=kbusch@meta.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox