public inbox for io-uring@vger.kernel.org
 help / color / mirror / Atom feed
From: Caleb Sander Mateos <csander@purestorage.com>
To: Keith Busch <kbusch@meta.com>
Cc: io-uring@vger.kernel.org, axboe@kernel.dk,
	Keith Busch <kbusch@kernel.org>
Subject: Re: [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED
Date: Tue, 14 Oct 2025 15:33:19 -0700	[thread overview]
Message-ID: <CADUfDZqe7+M9dqxVxUmMo31S1EGVmOhwqfKGLJfR45Yb_BT+Fg@mail.gmail.com> (raw)
In-Reply-To: <20251013180011.134131-3-kbusch@meta.com>

On Mon, Oct 13, 2025 at 11:00 AM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> Normal rings support 64b SQEs for posting submissions, while certain
> features require the ring to be configured with IORING_SETUP_SQE128, as
> they need to convey more information per submission. This, in turn,
> makes ALL the SQEs be 128b in size. This is somewhat wasteful and
> inefficient, particularly when only certain SQEs need to be of the
> bigger variant.
>
> This adds support for setting up a ring with mixed SQE sizes, using
> IORING_SETUP_SQE_MIXED. When set up in this mode, SQEs posted to the ring
> may be either 64b or 128b in size. If a SQE is 128b in size, then opcode
> will be set to a variant to indicate that this is the case. Any other
> non-128b opcode will assume the SQ's default size.
>
> SQEs on these types of mixed rings may also utilize NOP with skip
> success set.  This can happen if the ring is one (small) SQE entry away
> from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be
> contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this
> case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag
> set. The kernel will process this as a normal NOP and without posting a
> CQE.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  include/uapi/linux/io_uring.h |  8 ++++++++
>  io_uring/fdinfo.c             | 34 +++++++++++++++++++++++++++-------
>  io_uring/io_uring.c           | 35 +++++++++++++++++++++++++++++++----
>  io_uring/io_uring.h           | 14 ++------------
>  io_uring/opdef.c              | 26 ++++++++++++++++++++++++++
>  io_uring/opdef.h              |  2 ++
>  io_uring/register.c           |  2 +-
>  io_uring/uring_cmd.c          | 17 +++++++++++++++--
>  8 files changed, 112 insertions(+), 26 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 263bed13473ef..04797a9b76bc2 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit {
>   */
>  #define IORING_SETUP_CQE_MIXED         (1U << 18)
>
> +/*
> + * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have
> + * a 128b opcode.
> + */
> +#define IORING_SETUP_SQE_MIXED         (1U << 19)
> +
>  enum io_uring_op {
>         IORING_OP_NOP,
>         IORING_OP_READV,
> @@ -295,6 +301,8 @@ enum io_uring_op {
>         IORING_OP_READV_FIXED,
>         IORING_OP_WRITEV_FIXED,
>         IORING_OP_PIPE,
> +       IORING_OP_NOP128,
> +       IORING_OP_URING_CMD128,
>
>         /* this goes last, obviously */
>         IORING_OP_LAST,
> diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
> index ff3364531c77b..d14d2e983b623 100644
> --- a/io_uring/fdinfo.c
> +++ b/io_uring/fdinfo.c
> @@ -14,6 +14,7 @@
>  #include "fdinfo.h"
>  #include "cancel.h"
>  #include "rsrc.h"
> +#include "opdef.h"
>
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>  static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
> @@ -66,7 +67,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>         unsigned int cq_head = READ_ONCE(r->cq.head);
>         unsigned int cq_tail = READ_ONCE(r->cq.tail);
>         unsigned int sq_shift = 0;
> -       unsigned int sq_entries;
>         int sq_pid = -1, sq_cpu = -1;
>         u64 sq_total_time = 0, sq_work_time = 0;
>         unsigned int i;
> @@ -89,26 +89,45 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>         seq_printf(m, "CqTail:\t%u\n", cq_tail);
>         seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
>         seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
> -       sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
> -       for (i = 0; i < sq_entries; i++) {
> -               unsigned int entry = i + sq_head;
> +       while (sq_head < sq_tail) {
>                 struct io_uring_sqe *sqe;
>                 unsigned int sq_idx;
> +               bool sqe128 = false;
> +               u8 opcode;
>
>                 if (ctx->flags & IORING_SETUP_NO_SQARRAY)
>                         break;
> -               sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
> +               sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
>                 if (sq_idx > sq_mask)
>                         continue;
> +
>                 sqe = &ctx->sq_sqes[sq_idx << sq_shift];
> +               opcode = READ_ONCE(sqe->opcode);
> +               if (sq_shift)
> +                       sqe128 = true;
> +               else if (io_issue_defs[opcode].is_128) {
> +                       if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) {
> +                               seq_printf(m,
> +                                       "%5u: invalid sqe, 128B entry on non-mixed sq\n",
> +                                       sq_idx);
> +                               break;
> +                       }
> +                       if ((++sq_head & sq_mask) == 0) {
> +                               seq_printf(m,
> +                                       "%5u: corrupted sqe, wrapping 128B entry\n",
> +                                       sq_idx);
> +                               break;
> +                       }
> +                       sqe128 = true;
> +               }
>                 seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
>                               "addr:0x%llx, rw_flags:0x%x, buf_index:%d "
>                               "user_data:%llu",
> -                          sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
> +                          sq_idx, io_uring_get_opcode(opcode), sqe->fd,
>                            sqe->flags, (unsigned long long) sqe->off,
>                            (unsigned long long) sqe->addr, sqe->rw_flags,
>                            sqe->buf_index, sqe->user_data);
> -               if (sq_shift) {
> +               if (sqe128) {
>                         u64 *sqeb = (void *) (sqe + 1);
>                         int size = sizeof(struct io_uring_sqe) / sizeof(u64);
>                         int j;
> @@ -120,6 +139,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>                         }
>                 }
>                 seq_printf(m, "\n");
> +               sq_head++;
>         }
>         seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
>         while (cq_head < cq_tail) {
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 820ef05276667..cd84eb4f2d4ca 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2151,7 +2151,7 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err)
>  }
>
>  static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> -                      const struct io_uring_sqe *sqe)
> +                      const struct io_uring_sqe *sqe, unsigned int *left)
>         __must_hold(&ctx->uring_lock)
>  {
>         const struct io_issue_def *def;
> @@ -2177,6 +2177,22 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>         opcode = array_index_nospec(opcode, IORING_OP_LAST);
>
>         def = &io_issue_defs[opcode];
> +       if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) {
> +               /*
> +                * A 128b op on a non-128b SQ requires mixed SQE support as
> +                * well as 2 contiguous entries.
> +                */
> +               if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
> +                   !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
> +                       return io_init_fail_req(req, -EINVAL);
> +               /*
> +                * A 128b operation on a mixed SQ uses two entries, so we have
> +                * to increment the head and decrement what's left.
> +                */
> +               ctx->cached_sq_head++;
> +               (*left)--;

Hmm, io_submit_sqes() calls io_get_task_refs() at the start to
decrement cached_refs by the number of SQEs (counting 128-byte SQEs
twice) but io_put_task() only increments it once for each completed
request (counting 128-byte SQEs once). Does that mean there's a
refcount leak? Perhaps io_submit_sqes() or this block needs to
increment cached_refs to account for each 128-byte SQE?

Otherwise, this looks good to me.

Best,
Caleb


> +       }
> +
>         if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
>                 /* enforce forwards compatibility on users */
>                 if (sqe_flags & ~SQE_VALID_FLAGS)
> @@ -2286,13 +2302,13 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
>  }
>
>  static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
> -                        const struct io_uring_sqe *sqe)
> +                        const struct io_uring_sqe *sqe, unsigned int *left)
>         __must_hold(&ctx->uring_lock)
>  {
>         struct io_submit_link *link = &ctx->submit_state.link;
>         int ret;
>
> -       ret = io_init_req(ctx, req, sqe);
> +       ret = io_init_req(ctx, req, sqe, left);
>         if (unlikely(ret))
>                 return io_submit_fail_init(sqe, req, ret);
>
> @@ -2444,7 +2460,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
>                  * Continue submitting even for sqe failure if the
>                  * ring was setup with IORING_SETUP_SUBMIT_ALL
>                  */
> -               if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
> +               if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) &&
>                     !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
>                         left--;
>                         break;
> @@ -2789,6 +2805,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
>                 if (cq_entries < 2)
>                         return SIZE_MAX;
>         }
> +       if (flags & IORING_SETUP_SQE_MIXED) {
> +               if (sq_entries < 2)
> +                       return SIZE_MAX;
> +       }
>
>  #ifdef CONFIG_SMP
>         off = ALIGN(off, SMP_CACHE_BYTES);
> @@ -3715,6 +3735,13 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
>         if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
>             (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
>                 return -EINVAL;
> +       /*
> +        * Nonsensical to ask for SQE128 and mixed SQE support, it's not
> +        * supported to post 64b SQEs on a ring setup with SQE128.
> +        */
> +       if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) ==
> +           (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED))
> +               return -EINVAL;
>
>         return 0;
>  }
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index 46d9141d772a7..85ed8eb7df80c 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -54,7 +54,8 @@
>                         IORING_SETUP_REGISTERED_FD_ONLY |\
>                         IORING_SETUP_NO_SQARRAY |\
>                         IORING_SETUP_HYBRID_IOPOLL |\
> -                       IORING_SETUP_CQE_MIXED)
> +                       IORING_SETUP_CQE_MIXED |\
> +                       IORING_SETUP_SQE_MIXED)
>
>  #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
>                         IORING_ENTER_SQ_WAKEUP |\
> @@ -578,17 +579,6 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
>         io_req_task_work_add(req);
>  }
>
> -/*
> - * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
> - * slot.
> - */
> -static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
> -{
> -       if (ctx->flags & IORING_SETUP_SQE128)
> -               return 2 * sizeof(struct io_uring_sqe);
> -       return sizeof(struct io_uring_sqe);
> -}
> -
>  static inline bool io_file_can_poll(struct io_kiocb *req)
>  {
>         if (req->flags & REQ_F_CAN_POLL)
> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index 932319633eac2..df52d760240e4 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -575,6 +575,24 @@ const struct io_issue_def io_issue_defs[] = {
>                 .prep                   = io_pipe_prep,
>                 .issue                  = io_pipe,
>         },
> +       [IORING_OP_NOP128] = {
> +               .audit_skip             = 1,
> +               .iopoll                 = 1,
> +               .is_128                 = 1,
> +               .prep                   = io_nop_prep,
> +               .issue                  = io_nop,
> +       },
> +       [IORING_OP_URING_CMD128] = {
> +               .buffer_select          = 1,
> +               .needs_file             = 1,
> +               .plug                   = 1,
> +               .iopoll                 = 1,
> +               .iopoll_queue           = 1,
> +               .is_128                 = 1,
> +               .async_size             = sizeof(struct io_async_cmd),
> +               .prep                   = io_uring_cmd_prep,
> +               .issue                  = io_uring_cmd,
> +       },
>  };
>
>  const struct io_cold_def io_cold_defs[] = {
> @@ -825,6 +843,14 @@ const struct io_cold_def io_cold_defs[] = {
>         [IORING_OP_PIPE] = {
>                 .name                   = "PIPE",
>         },
> +       [IORING_OP_NOP128] = {
> +               .name                   = "NOP128",
> +       },
> +       [IORING_OP_URING_CMD128] = {
> +               .name                   = "URING_CMD128",
> +               .sqe_copy               = io_uring_cmd_sqe_copy,
> +               .cleanup                = io_uring_cmd_cleanup,
> +       },
>  };
>
>  const char *io_uring_get_opcode(u8 opcode)
> diff --git a/io_uring/opdef.h b/io_uring/opdef.h
> index c2f0907ed78cc..aa37846880ffd 100644
> --- a/io_uring/opdef.h
> +++ b/io_uring/opdef.h
> @@ -27,6 +27,8 @@ struct io_issue_def {
>         unsigned                iopoll_queue : 1;
>         /* vectored opcode, set if 1) vectored, and 2) handler needs to know */
>         unsigned                vectored : 1;
> +       /* set to 1 if this opcode uses 128b sqes in a mixed sq */
> +       unsigned                is_128 : 1;
>
>         /* size of async data needed, if any */
>         unsigned short          async_size;
> diff --git a/io_uring/register.c b/io_uring/register.c
> index 43f04c47522c0..e97d9cbba7111 100644
> --- a/io_uring/register.c
> +++ b/io_uring/register.c
> @@ -395,7 +395,7 @@ static void io_register_free_rings(struct io_ring_ctx *ctx,
>  #define RESIZE_FLAGS   (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
>  #define COPY_FLAGS     (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
>                          IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
> -                        IORING_SETUP_CQE_MIXED)
> +                        IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
>
>  static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
>  {
> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> index d1e3ba62ee8e8..a89b29cc5d199 100644
> --- a/io_uring/uring_cmd.c
> +++ b/io_uring/uring_cmd.c
> @@ -216,6 +216,18 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>         return 0;
>  }
>
> +/*
> + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
> + * slot.
> + */
> +static inline size_t uring_sqe_size(struct io_kiocb *req)
> +{
> +       if (req->ctx->flags & IORING_SETUP_SQE128 ||
> +           req->opcode == IORING_OP_URING_CMD128)
> +               return 2 * sizeof(struct io_uring_sqe);
> +       return sizeof(struct io_uring_sqe);
> +}
> +
>  void io_uring_cmd_sqe_copy(struct io_kiocb *req)
>  {
>         struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> @@ -224,7 +236,7 @@ void io_uring_cmd_sqe_copy(struct io_kiocb *req)
>         /* Should not happen, as REQ_F_SQE_COPIED covers this */
>         if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
>                 return;
> -       memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
> +       memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req));
>         ioucmd->sqe = ac->sqes;
>  }
>
> @@ -242,7 +254,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
>         if (ret)
>                 return ret;
>
> -       if (ctx->flags & IORING_SETUP_SQE128)
> +       if (ctx->flags & IORING_SETUP_SQE128 ||
> +           req->opcode == IORING_OP_URING_CMD128)
>                 issue_flags |= IO_URING_F_SQE128;
>         if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED))
>                 issue_flags |= IO_URING_F_CQE32;
> --
> 2.47.3
>

  reply	other threads:[~2025-10-14 22:33 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-13 18:00 [PATCHv5 0/4] liburing: support for mix sized sqe's Keith Busch
2025-10-13 18:00 ` [PATCHv5 0/1] io_uring: mixed submission queue entries sizes Keith Busch
2025-10-13 18:00 ` [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED Keith Busch
2025-10-14 22:33   ` Caleb Sander Mateos [this message]
2025-10-15  2:03     ` Keith Busch
2025-10-16 18:06       ` Keith Busch
2025-10-13 18:00 ` [PATCHv5 1/4] liburing: provide uring_cmd prep function Keith Busch
2025-10-19 16:24   ` Caleb Sander Mateos
2025-10-21 16:45     ` Keith Busch
2025-10-13 18:00 ` [PATCHv5 2/4] Add support IORING_SETUP_SQE_MIXED Keith Busch
2025-10-13 18:00 ` [PATCHv5 3/4] Add nop testing for IORING_SETUP_SQE_MIXED Keith Busch
2025-10-13 18:00 ` [PATCHv5 4/4] Add mixed sqe test for uring commands Keith Busch

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CADUfDZqe7+M9dqxVxUmMo31S1EGVmOhwqfKGLJfR45Yb_BT+Fg@mail.gmail.com \
    --to=csander@purestorage.com \
    --cc=axboe@kernel.dk \
    --cc=io-uring@vger.kernel.org \
    --cc=kbusch@kernel.org \
    --cc=kbusch@meta.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox