* [PATCHv5 0/1] io_uring: mixed submission queue entries sizes
2025-10-13 18:00 [PATCHv5 0/4] liburing: support for mix sized sqe's Keith Busch
@ 2025-10-13 18:00 ` Keith Busch
2025-10-13 18:00 ` [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED Keith Busch
` (4 subsequent siblings)
5 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2025-10-13 18:00 UTC (permalink / raw)
To: io-uring, axboe, csander; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
Resending to the correct mailing list.
Changes from v3:
- Allowed 128b opcodes on both big and mixed SQs
- Added additional comments for clarity
- Commit message fixups
- Moved the uring_cmd specific entry size function to the uring_cmd code.
Keith Busch (1):
io_uring: add support for IORING_SETUP_SQE_MIXED
include/uapi/linux/io_uring.h | 8 ++++++++
io_uring/fdinfo.c | 34 +++++++++++++++++++++++++++-------
io_uring/io_uring.c | 35 +++++++++++++++++++++++++++++++----
io_uring/io_uring.h | 14 ++------------
io_uring/opdef.c | 26 ++++++++++++++++++++++++++
io_uring/opdef.h | 2 ++
io_uring/register.c | 2 +-
io_uring/uring_cmd.c | 17 +++++++++++++++--
8 files changed, 112 insertions(+), 26 deletions(-)
--
2.47.3
* [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED
2025-10-13 18:00 [PATCHv5 0/4] liburing: support for mix sized sqe's Keith Busch
2025-10-13 18:00 ` [PATCHv5 0/1] io_uring: mixed submission queue entries sizes Keith Busch
@ 2025-10-13 18:00 ` Keith Busch
2025-10-14 22:33 ` Caleb Sander Mateos
2025-10-13 18:00 ` [PATCHv5 1/4] liburing: provide uring_cmd prep function Keith Busch
` (3 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2025-10-13 18:00 UTC (permalink / raw)
To: io-uring, axboe, csander; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
Normal rings support 64b SQEs for posting submissions, while certain
features require the ring to be configured with IORING_SETUP_SQE128, as
they need to convey more information per submission. This, in turn,
makes ALL SQEs 128b in size, which is somewhat wasteful and inefficient,
particularly when only certain SQEs need to be of the bigger variant.
This adds support for setting up a ring with mixed SQE sizes, using
IORING_SETUP_SQE_MIXED. When set up in this mode, SQEs posted to the
ring may be either 64b or 128b in size. If an SQE is 128b in size, its
opcode will be set to a 128b variant to indicate that this is the case.
Any non-128b opcode assumes the SQ's default 64b size.
SQEs on these types of mixed rings may also utilize NOP with skip
success set. This can happen if the ring is one (small) SQE entry away
from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be
contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this
case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag
set. The kernel will process this as a normal NOP without posting a
CQE. For example, on a mixed SQ with 8 entries, a 128b SQE cannot start
in slot 7; userspace fills slot 7 with a skip-success NOP and places the
128b SQE in slots 0 and 1 instead.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
include/uapi/linux/io_uring.h | 8 ++++++++
io_uring/fdinfo.c | 34 +++++++++++++++++++++++++++-------
io_uring/io_uring.c | 35 +++++++++++++++++++++++++++++++----
io_uring/io_uring.h | 14 ++------------
io_uring/opdef.c | 26 ++++++++++++++++++++++++++
io_uring/opdef.h | 2 ++
io_uring/register.c | 2 +-
io_uring/uring_cmd.c | 17 +++++++++++++++--
8 files changed, 112 insertions(+), 26 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 263bed13473ef..04797a9b76bc2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit {
*/
#define IORING_SETUP_CQE_MIXED (1U << 18)
+/*
+ * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have
+ * a 128b opcode.
+ */
+#define IORING_SETUP_SQE_MIXED (1U << 19)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -295,6 +301,8 @@ enum io_uring_op {
IORING_OP_READV_FIXED,
IORING_OP_WRITEV_FIXED,
IORING_OP_PIPE,
+ IORING_OP_NOP128,
+ IORING_OP_URING_CMD128,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77b..d14d2e983b623 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -14,6 +14,7 @@
#include "fdinfo.h"
#include "cancel.h"
#include "rsrc.h"
+#include "opdef.h"
#ifdef CONFIG_NET_RX_BUSY_POLL
static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
@@ -66,7 +67,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
unsigned int cq_head = READ_ONCE(r->cq.head);
unsigned int cq_tail = READ_ONCE(r->cq.tail);
unsigned int sq_shift = 0;
- unsigned int sq_entries;
int sq_pid = -1, sq_cpu = -1;
u64 sq_total_time = 0, sq_work_time = 0;
unsigned int i;
@@ -89,26 +89,45 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "CqTail:\t%u\n", cq_tail);
seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
- sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
- for (i = 0; i < sq_entries; i++) {
- unsigned int entry = i + sq_head;
+ while (sq_head < sq_tail) {
struct io_uring_sqe *sqe;
unsigned int sq_idx;
+ bool sqe128 = false;
+ u8 opcode;
if (ctx->flags & IORING_SETUP_NO_SQARRAY)
break;
- sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+ sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
if (sq_idx > sq_mask)
continue;
+
sqe = &ctx->sq_sqes[sq_idx << sq_shift];
+ opcode = READ_ONCE(sqe->opcode);
+ if (sq_shift)
+ sqe128 = true;
+ else if (io_issue_defs[opcode].is_128) {
+ if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) {
+ seq_printf(m,
+ "%5u: invalid sqe, 128B entry on non-mixed sq\n",
+ sq_idx);
+ break;
+ }
+ if ((++sq_head & sq_mask) == 0) {
+ seq_printf(m,
+ "%5u: corrupted sqe, wrapping 128B entry\n",
+ sq_idx);
+ break;
+ }
+ sqe128 = true;
+ }
seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
"addr:0x%llx, rw_flags:0x%x, buf_index:%d "
"user_data:%llu",
- sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
+ sq_idx, io_uring_get_opcode(opcode), sqe->fd,
sqe->flags, (unsigned long long) sqe->off,
(unsigned long long) sqe->addr, sqe->rw_flags,
sqe->buf_index, sqe->user_data);
- if (sq_shift) {
+ if (sqe128) {
u64 *sqeb = (void *) (sqe + 1);
int size = sizeof(struct io_uring_sqe) / sizeof(u64);
int j;
@@ -120,6 +139,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
}
}
seq_printf(m, "\n");
+ sq_head++;
}
seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
while (cq_head < cq_tail) {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 820ef05276667..cd84eb4f2d4ca 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2151,7 +2151,7 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err)
}
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+ const struct io_uring_sqe *sqe, unsigned int *left)
__must_hold(&ctx->uring_lock)
{
const struct io_issue_def *def;
@@ -2177,6 +2177,22 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
opcode = array_index_nospec(opcode, IORING_OP_LAST);
def = &io_issue_defs[opcode];
+ if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) {
+ /*
+ * A 128b op on a non-128b SQ requires mixed SQE support as
+ * well as 2 contiguous entries.
+ */
+ if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
+ !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
+ return io_init_fail_req(req, -EINVAL);
+ /*
+ * A 128b operation on a mixed SQ uses two entries, so we have
+ * to increment the head and decrement what's left.
+ */
+ ctx->cached_sq_head++;
+ (*left)--;
+ }
+
if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
@@ -2286,13 +2302,13 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
}
static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+ const struct io_uring_sqe *sqe, unsigned int *left)
__must_hold(&ctx->uring_lock)
{
struct io_submit_link *link = &ctx->submit_state.link;
int ret;
- ret = io_init_req(ctx, req, sqe);
+ ret = io_init_req(ctx, req, sqe, left);
if (unlikely(ret))
return io_submit_fail_init(sqe, req, ret);
@@ -2444,7 +2460,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
* Continue submitting even for sqe failure if the
* ring was setup with IORING_SETUP_SUBMIT_ALL
*/
- if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+ if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) &&
!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
left--;
break;
@@ -2789,6 +2805,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
if (cq_entries < 2)
return SIZE_MAX;
}
+ if (flags & IORING_SETUP_SQE_MIXED) {
+ if (sq_entries < 2)
+ return SIZE_MAX;
+ }
#ifdef CONFIG_SMP
off = ALIGN(off, SMP_CACHE_BYTES);
@@ -3715,6 +3735,13 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
(IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
return -EINVAL;
+ /*
+ * Nonsensical to ask for SQE128 and mixed SQE support, it's not
+ * supported to post 64b SQEs on a ring setup with SQE128.
+ */
+ if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) ==
+ (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED))
+ return -EINVAL;
return 0;
}
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 46d9141d772a7..85ed8eb7df80c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -54,7 +54,8 @@
IORING_SETUP_REGISTERED_FD_ONLY |\
IORING_SETUP_NO_SQARRAY |\
IORING_SETUP_HYBRID_IOPOLL |\
- IORING_SETUP_CQE_MIXED)
+ IORING_SETUP_CQE_MIXED |\
+ IORING_SETUP_SQE_MIXED)
#define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
IORING_ENTER_SQ_WAKEUP |\
@@ -578,17 +579,6 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
io_req_task_work_add(req);
}
-/*
- * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
- * slot.
- */
-static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
-{
- if (ctx->flags & IORING_SETUP_SQE128)
- return 2 * sizeof(struct io_uring_sqe);
- return sizeof(struct io_uring_sqe);
-}
-
static inline bool io_file_can_poll(struct io_kiocb *req)
{
if (req->flags & REQ_F_CAN_POLL)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 932319633eac2..df52d760240e4 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -575,6 +575,24 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_pipe_prep,
.issue = io_pipe,
},
+ [IORING_OP_NOP128] = {
+ .audit_skip = 1,
+ .iopoll = 1,
+ .is_128 = 1,
+ .prep = io_nop_prep,
+ .issue = io_nop,
+ },
+ [IORING_OP_URING_CMD128] = {
+ .buffer_select = 1,
+ .needs_file = 1,
+ .plug = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .is_128 = 1,
+ .async_size = sizeof(struct io_async_cmd),
+ .prep = io_uring_cmd_prep,
+ .issue = io_uring_cmd,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -825,6 +843,14 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_PIPE] = {
.name = "PIPE",
},
+ [IORING_OP_NOP128] = {
+ .name = "NOP128",
+ },
+ [IORING_OP_URING_CMD128] = {
+ .name = "URING_CMD128",
+ .sqe_copy = io_uring_cmd_sqe_copy,
+ .cleanup = io_uring_cmd_cleanup,
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index c2f0907ed78cc..aa37846880ffd 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -27,6 +27,8 @@ struct io_issue_def {
unsigned iopoll_queue : 1;
/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
unsigned vectored : 1;
+ /* set to 1 if this opcode uses 128b sqes in a mixed sq */
+ unsigned is_128 : 1;
/* size of async data needed, if any */
unsigned short async_size;
diff --git a/io_uring/register.c b/io_uring/register.c
index 43f04c47522c0..e97d9cbba7111 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -395,7 +395,7 @@ static void io_register_free_rings(struct io_ring_ctx *ctx,
#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
- IORING_SETUP_CQE_MIXED)
+ IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index d1e3ba62ee8e8..a89b29cc5d199 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -216,6 +216,18 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
+/*
+ * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
+ * slot.
+ */
+static inline size_t uring_sqe_size(struct io_kiocb *req)
+{
+ if (req->ctx->flags & IORING_SETUP_SQE128 ||
+ req->opcode == IORING_OP_URING_CMD128)
+ return 2 * sizeof(struct io_uring_sqe);
+ return sizeof(struct io_uring_sqe);
+}
+
void io_uring_cmd_sqe_copy(struct io_kiocb *req)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -224,7 +236,7 @@ void io_uring_cmd_sqe_copy(struct io_kiocb *req)
/* Should not happen, as REQ_F_SQE_COPIED covers this */
if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
return;
- memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
+ memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req));
ioucmd->sqe = ac->sqes;
}
@@ -242,7 +254,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
if (ret)
return ret;
- if (ctx->flags & IORING_SETUP_SQE128)
+ if (ctx->flags & IORING_SETUP_SQE128 ||
+ req->opcode == IORING_OP_URING_CMD128)
issue_flags |= IO_URING_F_SQE128;
if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED))
issue_flags |= IO_URING_F_CQE32;
--
2.47.3
* Re: [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED
2025-10-13 18:00 ` [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED Keith Busch
@ 2025-10-14 22:33 ` Caleb Sander Mateos
2025-10-15 2:03 ` Keith Busch
0 siblings, 1 reply; 12+ messages in thread
From: Caleb Sander Mateos @ 2025-10-14 22:33 UTC (permalink / raw)
To: Keith Busch; +Cc: io-uring, axboe, Keith Busch
On Mon, Oct 13, 2025 at 11:00 AM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> Normal rings support 64b SQEs for posting submissions, while certain
> features require the ring to be configured with IORING_SETUP_SQE128, as
> they need to convey more information per submission. This, in turn,
> makes ALL the SQEs be 128b in size. This is somewhat wasteful and
> inefficient, particularly when only certain SQEs need to be of the
> bigger variant.
>
> This adds support for setting up a ring with mixed SQE sizes, using
> IORING_SETUP_SQE_MIXED. When setup in this mode, SQEs posted to the ring
> may be either 64b or 128b in size. If a SQE is 128b in size, then opcode
> will be set to a variante to indicate that this is the case. Any other
> non-128b opcode will assume the SQ's default size.
>
> SQEs on these types of mixed rings may also utilize NOP with skip
> success set. This can happen if the ring is one (small) SQE entry away
> from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be
> contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this
> case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag
> set. The kernel will process this as a normal NOP and without posting a
> CQE.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> include/uapi/linux/io_uring.h | 8 ++++++++
> io_uring/fdinfo.c | 34 +++++++++++++++++++++++++++-------
> io_uring/io_uring.c | 35 +++++++++++++++++++++++++++++++----
> io_uring/io_uring.h | 14 ++------------
> io_uring/opdef.c | 26 ++++++++++++++++++++++++++
> io_uring/opdef.h | 2 ++
> io_uring/register.c | 2 +-
> io_uring/uring_cmd.c | 17 +++++++++++++++--
> 8 files changed, 112 insertions(+), 26 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 263bed13473ef..04797a9b76bc2 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit {
> */
> #define IORING_SETUP_CQE_MIXED (1U << 18)
>
> +/*
> + * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have
> + * a 128b opcode.
> + */
> +#define IORING_SETUP_SQE_MIXED (1U << 19)
> +
> enum io_uring_op {
> IORING_OP_NOP,
> IORING_OP_READV,
> @@ -295,6 +301,8 @@ enum io_uring_op {
> IORING_OP_READV_FIXED,
> IORING_OP_WRITEV_FIXED,
> IORING_OP_PIPE,
> + IORING_OP_NOP128,
> + IORING_OP_URING_CMD128,
>
> /* this goes last, obviously */
> IORING_OP_LAST,
> diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
> index ff3364531c77b..d14d2e983b623 100644
> --- a/io_uring/fdinfo.c
> +++ b/io_uring/fdinfo.c
> @@ -14,6 +14,7 @@
> #include "fdinfo.h"
> #include "cancel.h"
> #include "rsrc.h"
> +#include "opdef.h"
>
> #ifdef CONFIG_NET_RX_BUSY_POLL
> static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
> @@ -66,7 +67,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> unsigned int cq_head = READ_ONCE(r->cq.head);
> unsigned int cq_tail = READ_ONCE(r->cq.tail);
> unsigned int sq_shift = 0;
> - unsigned int sq_entries;
> int sq_pid = -1, sq_cpu = -1;
> u64 sq_total_time = 0, sq_work_time = 0;
> unsigned int i;
> @@ -89,26 +89,45 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> seq_printf(m, "CqTail:\t%u\n", cq_tail);
> seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
> seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
> - sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
> - for (i = 0; i < sq_entries; i++) {
> - unsigned int entry = i + sq_head;
> + while (sq_head < sq_tail) {
> struct io_uring_sqe *sqe;
> unsigned int sq_idx;
> + bool sqe128 = false;
> + u8 opcode;
>
> if (ctx->flags & IORING_SETUP_NO_SQARRAY)
> break;
> - sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
> + sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
> if (sq_idx > sq_mask)
> continue;
> +
> sqe = &ctx->sq_sqes[sq_idx << sq_shift];
> + opcode = READ_ONCE(sqe->opcode);
> + if (sq_shift)
> + sqe128 = true;
> + else if (io_issue_defs[opcode].is_128) {
> + if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) {
> + seq_printf(m,
> + "%5u: invalid sqe, 128B entry on non-mixed sq\n",
> + sq_idx);
> + break;
> + }
> + if ((++sq_head & sq_mask) == 0) {
> + seq_printf(m,
> + "%5u: corrupted sqe, wrapping 128B entry\n",
> + sq_idx);
> + break;
> + }
> + sqe128 = true;
> + }
> seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
> "addr:0x%llx, rw_flags:0x%x, buf_index:%d "
> "user_data:%llu",
> - sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
> + sq_idx, io_uring_get_opcode(opcode), sqe->fd,
> sqe->flags, (unsigned long long) sqe->off,
> (unsigned long long) sqe->addr, sqe->rw_flags,
> sqe->buf_index, sqe->user_data);
> - if (sq_shift) {
> + if (sqe128) {
> u64 *sqeb = (void *) (sqe + 1);
> int size = sizeof(struct io_uring_sqe) / sizeof(u64);
> int j;
> @@ -120,6 +139,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> }
> }
> seq_printf(m, "\n");
> + sq_head++;
> }
> seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
> while (cq_head < cq_tail) {
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 820ef05276667..cd84eb4f2d4ca 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2151,7 +2151,7 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err)
> }
>
> static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> - const struct io_uring_sqe *sqe)
> + const struct io_uring_sqe *sqe, unsigned int *left)
> __must_hold(&ctx->uring_lock)
> {
> const struct io_issue_def *def;
> @@ -2177,6 +2177,22 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> opcode = array_index_nospec(opcode, IORING_OP_LAST);
>
> def = &io_issue_defs[opcode];
> + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) {
> + /*
> + * A 128b op on a non-128b SQ requires mixed SQE support as
> + * well as 2 contiguous entries.
> + */
> + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
> + !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
> + return io_init_fail_req(req, -EINVAL);
> + /*
> + * A 128b operation on a mixed SQ uses two entries, so we have
> + * to increment the head and decrement what's left.
> + */
> + ctx->cached_sq_head++;
> + (*left)--;
Hmm, io_submit_sqes() calls io_get_task_refs() at the start to
decrement cached_refs by the number of SQEs (counting 128-byte SQEs
twice) but io_put_task() only increments it once for each completed
request (counting 128-byte SQEs once). Does that mean there's a
refcount leak? Perhaps io_submit_sqes() or this block needs to
increment cached_refs to account for each 128-byte SQE?
Otherwise, this looks good to me.
Best,
Caleb
> + }
> +
> if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
> /* enforce forwards compatibility on users */
> if (sqe_flags & ~SQE_VALID_FLAGS)
> @@ -2286,13 +2302,13 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
> }
>
> static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
> - const struct io_uring_sqe *sqe)
> + const struct io_uring_sqe *sqe, unsigned int *left)
> __must_hold(&ctx->uring_lock)
> {
> struct io_submit_link *link = &ctx->submit_state.link;
> int ret;
>
> - ret = io_init_req(ctx, req, sqe);
> + ret = io_init_req(ctx, req, sqe, left);
> if (unlikely(ret))
> return io_submit_fail_init(sqe, req, ret);
>
> @@ -2444,7 +2460,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
> * Continue submitting even for sqe failure if the
> * ring was setup with IORING_SETUP_SUBMIT_ALL
> */
> - if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
> + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) &&
> !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
> left--;
> break;
> @@ -2789,6 +2805,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
> if (cq_entries < 2)
> return SIZE_MAX;
> }
> + if (flags & IORING_SETUP_SQE_MIXED) {
> + if (sq_entries < 2)
> + return SIZE_MAX;
> + }
>
> #ifdef CONFIG_SMP
> off = ALIGN(off, SMP_CACHE_BYTES);
> @@ -3715,6 +3735,13 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
> if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
> (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
> return -EINVAL;
> + /*
> + * Nonsensical to ask for SQE128 and mixed SQE support, it's not
> + * supported to post 64b SQEs on a ring setup with SQE128.
> + */
> + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) ==
> + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED))
> + return -EINVAL;
>
> return 0;
> }
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index 46d9141d772a7..85ed8eb7df80c 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -54,7 +54,8 @@
> IORING_SETUP_REGISTERED_FD_ONLY |\
> IORING_SETUP_NO_SQARRAY |\
> IORING_SETUP_HYBRID_IOPOLL |\
> - IORING_SETUP_CQE_MIXED)
> + IORING_SETUP_CQE_MIXED |\
> + IORING_SETUP_SQE_MIXED)
>
> #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
> IORING_ENTER_SQ_WAKEUP |\
> @@ -578,17 +579,6 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
> io_req_task_work_add(req);
> }
>
> -/*
> - * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
> - * slot.
> - */
> -static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
> -{
> - if (ctx->flags & IORING_SETUP_SQE128)
> - return 2 * sizeof(struct io_uring_sqe);
> - return sizeof(struct io_uring_sqe);
> -}
> -
> static inline bool io_file_can_poll(struct io_kiocb *req)
> {
> if (req->flags & REQ_F_CAN_POLL)
> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index 932319633eac2..df52d760240e4 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -575,6 +575,24 @@ const struct io_issue_def io_issue_defs[] = {
> .prep = io_pipe_prep,
> .issue = io_pipe,
> },
> + [IORING_OP_NOP128] = {
> + .audit_skip = 1,
> + .iopoll = 1,
> + .is_128 = 1,
> + .prep = io_nop_prep,
> + .issue = io_nop,
> + },
> + [IORING_OP_URING_CMD128] = {
> + .buffer_select = 1,
> + .needs_file = 1,
> + .plug = 1,
> + .iopoll = 1,
> + .iopoll_queue = 1,
> + .is_128 = 1,
> + .async_size = sizeof(struct io_async_cmd),
> + .prep = io_uring_cmd_prep,
> + .issue = io_uring_cmd,
> + },
> };
>
> const struct io_cold_def io_cold_defs[] = {
> @@ -825,6 +843,14 @@ const struct io_cold_def io_cold_defs[] = {
> [IORING_OP_PIPE] = {
> .name = "PIPE",
> },
> + [IORING_OP_NOP128] = {
> + .name = "NOP128",
> + },
> + [IORING_OP_URING_CMD128] = {
> + .name = "URING_CMD128",
> + .sqe_copy = io_uring_cmd_sqe_copy,
> + .cleanup = io_uring_cmd_cleanup,
> + },
> };
>
> const char *io_uring_get_opcode(u8 opcode)
> diff --git a/io_uring/opdef.h b/io_uring/opdef.h
> index c2f0907ed78cc..aa37846880ffd 100644
> --- a/io_uring/opdef.h
> +++ b/io_uring/opdef.h
> @@ -27,6 +27,8 @@ struct io_issue_def {
> unsigned iopoll_queue : 1;
> /* vectored opcode, set if 1) vectored, and 2) handler needs to know */
> unsigned vectored : 1;
> + /* set to 1 if this opcode uses 128b sqes in a mixed sq */
> + unsigned is_128 : 1;
>
> /* size of async data needed, if any */
> unsigned short async_size;
> diff --git a/io_uring/register.c b/io_uring/register.c
> index 43f04c47522c0..e97d9cbba7111 100644
> --- a/io_uring/register.c
> +++ b/io_uring/register.c
> @@ -395,7 +395,7 @@ static void io_register_free_rings(struct io_ring_ctx *ctx,
> #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
> #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
> IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
> - IORING_SETUP_CQE_MIXED)
> + IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
>
> static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
> {
> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> index d1e3ba62ee8e8..a89b29cc5d199 100644
> --- a/io_uring/uring_cmd.c
> +++ b/io_uring/uring_cmd.c
> @@ -216,6 +216,18 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> return 0;
> }
>
> +/*
> + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
> + * slot.
> + */
> +static inline size_t uring_sqe_size(struct io_kiocb *req)
> +{
> + if (req->ctx->flags & IORING_SETUP_SQE128 ||
> + req->opcode == IORING_OP_URING_CMD128)
> + return 2 * sizeof(struct io_uring_sqe);
> + return sizeof(struct io_uring_sqe);
> +}
> +
> void io_uring_cmd_sqe_copy(struct io_kiocb *req)
> {
> struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> @@ -224,7 +236,7 @@ void io_uring_cmd_sqe_copy(struct io_kiocb *req)
> /* Should not happen, as REQ_F_SQE_COPIED covers this */
> if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
> return;
> - memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
> + memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req));
> ioucmd->sqe = ac->sqes;
> }
>
> @@ -242,7 +254,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
> if (ret)
> return ret;
>
> - if (ctx->flags & IORING_SETUP_SQE128)
> + if (ctx->flags & IORING_SETUP_SQE128 ||
> + req->opcode == IORING_OP_URING_CMD128)
> issue_flags |= IO_URING_F_SQE128;
> if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED))
> issue_flags |= IO_URING_F_CQE32;
> --
> 2.47.3
>
* Re: [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED
2025-10-14 22:33 ` Caleb Sander Mateos
@ 2025-10-15 2:03 ` Keith Busch
2025-10-16 18:06 ` Keith Busch
0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2025-10-15 2:03 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Keith Busch, io-uring, axboe
On Tue, Oct 14, 2025 at 03:33:19PM -0700, Caleb Sander Mateos wrote:
> On Mon, Oct 13, 2025 at 11:00 AM Keith Busch <kbusch@meta.com> wrote:
> > + /*
> > + * A 128b op on a non-128b SQ requires mixed SQE support as
> > + * well as 2 contiguous entries.
> > + */
> > + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
> > + !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
> > + return io_init_fail_req(req, -EINVAL);
> > + /*
> > + * A 128b operation on a mixed SQ uses two entries, so we have
> > + * to increment the head and decrement what's left.
> > + */
> > + ctx->cached_sq_head++;
> > + (*left)--;
>
> Hmm, io_submit_sqes() calls io_get_task_refs() at the start to
> decrement cached_refs by the number of SQEs (counting 128-byte SQEs
> twice) but io_put_task() only increments it once for each completed
> request (counting 128-byte SQEs once). Does that mean there's a
> refcount leak? Perhaps io_submit_sqes() or this block needs to
> increment cached_refs to account for each 128-byte SQE?
It looks like you're right. I think the increment option is the easiest
way to deal with it, just adding this line into the above:
+ current->io_uring->cached_refs++;
I'm going to take a moment to figure out a good way to test this because
I don't think I'm hitting any problem with the admittedly simple tests
I've offered to liburing, so I may be missing something.
* Re: [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED
2025-10-15 2:03 ` Keith Busch
@ 2025-10-16 18:06 ` Keith Busch
0 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2025-10-16 18:06 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Keith Busch, io-uring, axboe
On Tue, Oct 14, 2025 at 08:03:42PM -0600, Keith Busch wrote:
> On Tue, Oct 14, 2025 at 03:33:19PM -0700, Caleb Sander Mateos wrote:
> > On Mon, Oct 13, 2025 at 11:00 AM Keith Busch <kbusch@meta.com> wrote:
> > > + /*
> > > + * A 128b op on a non-128b SQ requires mixed SQE support as
> > > + * well as 2 contiguous entries.
> > > + */
> > > + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
> > > + !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
> > > + return io_init_fail_req(req, -EINVAL);
> > > + /*
> > > + * A 128b operation on a mixed SQ uses two entries, so we have
> > > + * to increment the head and decrement what's left.
> > > + */
> > > + ctx->cached_sq_head++;
> > > + (*left)--;
> >
> > Hmm, io_submit_sqes() calls io_get_task_refs() at the start to
> > decrement cached_refs by the number of SQEs (counting 128-byte SQEs
> > twice) but io_put_task() only increments it once for each completed
> > request (counting 128-byte SQEs once). Does that mean there's a
> > refcount leak? Perhaps io_submit_sqes() or this block needs to
> > increment cached_refs to account for each 128-byte SQE?
>
> It looks like you're right. I think the increment option is the easiest
> way to deal with it, just adding this line into the above:
>
> + current->io_uring->cached_refs++;
>
> I'm going to take a moment to figure out a good way to test this because
> I don't think I'm hitting any problem with the admittedly simple tests
> I've offered to liburing, so I may be missing something.
So the tests were in fact missing some usage reference puts, but I'm not
sure how to check for such leakage. Everything ends up clearing up once
the ring closes, and there's no other visibility into the refcount from
user space. I had to add some trace_printks just to verify it. The
increment above gets everything back to normal, at least, so I will send
a new version with that in.
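For reference, a rough sketch of how that branch of io_init_req() might
look with the fix folded in, combining the hunk quoted above with the
cached_refs increment (the exact placement in the next revision may
differ):
	if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) {
		/*
		 * A 128b op on a non-128b SQ requires mixed SQE support as
		 * well as 2 contiguous entries.
		 */
		if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
		    !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
			return io_init_fail_req(req, -EINVAL);
		/*
		 * A 128b operation consumes two SQ entries: advance the
		 * cached head past the second slot, account for it in what's
		 * left, and return the extra task reference that
		 * io_get_task_refs() took for that slot, since only one
		 * request will complete.
		 */
		ctx->cached_sq_head++;
		(*left)--;
		current->io_uring->cached_refs++;
	}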
* [PATCHv5 1/4] liburing: provide uring_cmd prep function
2025-10-13 18:00 [PATCHv5 0/4] liburing: support for mix sized sqe's Keith Busch
2025-10-13 18:00 ` [PATCHv5 0/1] io_uring: mixed submission queue entries sizes Keith Busch
2025-10-13 18:00 ` [PATCHv5 1/1] io_uring: add support for IORING_SETUP_SQE_MIXED Keith Busch
@ 2025-10-13 18:00 ` Keith Busch
2025-10-19 16:24 ` Caleb Sander Mateos
2025-10-13 18:00 ` [PATCHv5 2/4] Add support IORING_SETUP_SQE_MIXED Keith Busch
` (2 subsequent siblings)
5 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2025-10-13 18:00 UTC (permalink / raw)
To: io-uring, axboe, csander; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
The rw prep doesn't clear __pad1, which is a reserved field for
uring_cmd. If a prior submission in that entry did use that field, the
uring_cmd will fail the kernel's checks.
Also, the nvme uring_cmd tests had a couple places setting the sqe addr
and length, which are unused fields for the nvme uring_cmds, so they
shouldn't have been doing that, though had been checking these, so it
didn't cause any errors.
Provide a helper function specific to the uring_cmd preparation.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
src/include/liburing.h | 19 +++++++++++++++----
test/io_uring_passthrough.c | 14 ++++----------
2 files changed, 19 insertions(+), 14 deletions(-)
diff --git a/src/include/liburing.h b/src/include/liburing.h
index c80bffd3..f7af20aa 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -1517,6 +1517,19 @@ IOURINGINLINE void io_uring_prep_socket_direct_alloc(struct io_uring_sqe *sqe,
__io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1);
}
+IOURINGINLINE void io_uring_prep_uring_cmd(struct io_uring_sqe *sqe,
+ int cmd_op,
+ int fd)
+ LIBURING_NOEXCEPT
+{
+ sqe->opcode = (__u8) IORING_OP_URING_CMD;
+ sqe->fd = fd;
+ sqe->cmd_op = cmd_op;
+ sqe->__pad1 = 0;
+ sqe->addr = 0ul;
+ sqe->len = 0;
+}
+
/*
* Prepare commands for sockets
*/
@@ -1529,11 +1542,10 @@ IOURINGINLINE void io_uring_prep_cmd_sock(struct io_uring_sqe *sqe,
int optlen)
LIBURING_NOEXCEPT
{
- io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, NULL, 0, 0);
+ io_uring_prep_uring_cmd(sqe, cmd_op, fd);
sqe->optval = (unsigned long) (uintptr_t) optval;
sqe->optname = optname;
sqe->optlen = optlen;
- sqe->cmd_op = cmd_op;
sqe->level = level;
}
@@ -1607,8 +1619,7 @@ IOURINGINLINE void io_uring_prep_cmd_discard(struct io_uring_sqe *sqe,
uint64_t offset, uint64_t nbytes)
LIBURING_NOEXCEPT
{
- io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, 0, 0, 0);
- sqe->cmd_op = BLOCK_URING_CMD_DISCARD;
+ io_uring_prep_uring_cmd(sqe, BLOCK_URING_CMD_DISCARD, fd);
sqe->addr = offset;
sqe->addr3 = nbytes;
}
diff --git a/test/io_uring_passthrough.c b/test/io_uring_passthrough.c
index beaa81ad..26051710 100644
--- a/test/io_uring_passthrough.c
+++ b/test/io_uring_passthrough.c
@@ -148,11 +148,9 @@ static int __test_io(const char *file, struct io_uring *ring, int tc, int read,
if (async)
sqe->flags |= IOSQE_ASYNC;
if (nonvec)
- sqe->cmd_op = NVME_URING_CMD_IO;
+ io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO, use_fd);
else
- sqe->cmd_op = NVME_URING_CMD_IO_VEC;
- sqe->fd = use_fd;
- sqe->opcode = IORING_OP_URING_CMD;
+ io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO_VEC, use_fd);
if (do_fixed)
sqe->uring_cmd_flags |= IORING_URING_CMD_FIXED;
sqe->user_data = ((uint64_t)offset << 32) | i;
@@ -328,9 +326,7 @@ static int test_invalid_passthru_submit(const char *file)
}
sqe = io_uring_get_sqe(&ring);
- io_uring_prep_read(sqe, fd, vecs[0].iov_base, vecs[0].iov_len, 0);
- sqe->cmd_op = NVME_URING_CMD_IO;
- sqe->opcode = IORING_OP_URING_CMD;
+ io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO, fd);
sqe->user_data = 1;
cmd = (struct nvme_uring_cmd *)sqe->cmd;
memset(cmd, 0, sizeof(struct nvme_uring_cmd));
@@ -401,10 +397,8 @@ static int test_io_uring_submit_enters(const char *file)
__u32 nlb;
sqe = io_uring_get_sqe(&ring);
- io_uring_prep_readv(sqe, fd, &vecs[i], 1, offset);
+ io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO, fd);
sqe->user_data = i;
- sqe->opcode = IORING_OP_URING_CMD;
- sqe->cmd_op = NVME_URING_CMD_IO;
cmd = (struct nvme_uring_cmd *)sqe->cmd;
memset(cmd, 0, sizeof(struct nvme_uring_cmd));
--
2.47.3
* Re: [PATCHv5 1/4] liburing: provide uring_cmd prep function
2025-10-13 18:00 ` [PATCHv5 1/4] liburing: provide uring_cmd prep function Keith Busch
@ 2025-10-19 16:24 ` Caleb Sander Mateos
2025-10-21 16:45 ` Keith Busch
0 siblings, 1 reply; 12+ messages in thread
From: Caleb Sander Mateos @ 2025-10-19 16:24 UTC (permalink / raw)
To: Keith Busch; +Cc: io-uring, axboe, Keith Busch
Looks good to me, just a few minor comments.
On Mon, Oct 13, 2025 at 11:00 AM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> The rw prep doesn't clear __pad1, which is a reserved field for
io_uring_prep_rw() does assign to sqe->off, which is unioned with
cmd_op and __pad1. Though obviously __pad1 being set to 0 is dependent
on a offset being passed as 0 to io_uring_prep_rw(). But I certainly
agree a dedicated helper for IORING_OP_URING_CMD is a great
improvement.
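For context, the fields involved overlap in struct io_uring_sqe roughly
like this (abridged from the uapi header):
	union {
		__u64	off;	/* offset into file */
		__u64	addr2;
		struct {
			__u32	cmd_op;
			__u32	__pad1;
		};
	};
so the 0 offset that io_uring_prep_rw() writes to sqe->off is what
happens to zero cmd_op and __pad1 today.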
> uring_cmd. If a prior submission in that entry did use that field, the
> uring_cmd will fail the kernel's checks.
>
> Also, the nvme uring_cmd tests had a couple places setting the sqe addr
> and length, which are unused fields for the nvme uring_cmds, so they
> shouldn't have been doing that, though had been checking these, so it
"had" -> "hadn't"?
> didn't cause any errors.
>
> Provide a helper function specific to the uring_cmd preparation.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> src/include/liburing.h | 19 +++++++++++++++----
> test/io_uring_passthrough.c | 14 ++++----------
> 2 files changed, 19 insertions(+), 14 deletions(-)
>
> diff --git a/src/include/liburing.h b/src/include/liburing.h
> index c80bffd3..f7af20aa 100644
> --- a/src/include/liburing.h
> +++ b/src/include/liburing.h
> @@ -1517,6 +1517,19 @@ IOURINGINLINE void io_uring_prep_socket_direct_alloc(struct io_uring_sqe *sqe,
> __io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1);
> }
>
> +IOURINGINLINE void io_uring_prep_uring_cmd(struct io_uring_sqe *sqe,
> + int cmd_op,
I see this is copied from io_uring_prep_cmd_sock(), but u32 is
probably more accurate.
> + int fd)
> + LIBURING_NOEXCEPT
> +{
> + sqe->opcode = (__u8) IORING_OP_URING_CMD;
Casting the constant seems unnecessary. Do compilers really warn about this?
> + sqe->fd = fd;
> + sqe->cmd_op = cmd_op;
> + sqe->__pad1 = 0;
> + sqe->addr = 0ul;
> + sqe->len = 0;
> +}
> +
> /*
> * Prepare commands for sockets
> */
> @@ -1529,11 +1542,10 @@ IOURINGINLINE void io_uring_prep_cmd_sock(struct io_uring_sqe *sqe,
> int optlen)
> LIBURING_NOEXCEPT
> {
> - io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, NULL, 0, 0);
> + io_uring_prep_uring_cmd(sqe, cmd_op, fd);
> sqe->optval = (unsigned long) (uintptr_t) optval;
> sqe->optname = optname;
> sqe->optlen = optlen;
> - sqe->cmd_op = cmd_op;
> sqe->level = level;
> }
>
> @@ -1607,8 +1619,7 @@ IOURINGINLINE void io_uring_prep_cmd_discard(struct io_uring_sqe *sqe,
> uint64_t offset, uint64_t nbytes)
> LIBURING_NOEXCEPT
> {
> - io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, 0, 0, 0);
> - sqe->cmd_op = BLOCK_URING_CMD_DISCARD;
> + io_uring_prep_uring_cmd(sqe, BLOCK_URING_CMD_DISCARD, fd);
> sqe->addr = offset;
> sqe->addr3 = nbytes;
> }
> diff --git a/test/io_uring_passthrough.c b/test/io_uring_passthrough.c
> index beaa81ad..26051710 100644
> --- a/test/io_uring_passthrough.c
> +++ b/test/io_uring_passthrough.c
> @@ -148,11 +148,9 @@ static int __test_io(const char *file, struct io_uring *ring, int tc, int read,
> if (async)
> sqe->flags |= IOSQE_ASYNC;
> if (nonvec)
> - sqe->cmd_op = NVME_URING_CMD_IO;
> + io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO, use_fd);
I guess this works because io_uring_prep_uring_cmd() doesn't touch
sqe->buf_index or sqe->flags, but it seems like it would be less
brittle to call io_uring_prep_uring_cmd() before setting any of the
other sqe fields.
> else
> - sqe->cmd_op = NVME_URING_CMD_IO_VEC;
> - sqe->fd = use_fd;
> - sqe->opcode = IORING_OP_URING_CMD;
> + io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO_VEC, use_fd);
> if (do_fixed)
> sqe->uring_cmd_flags |= IORING_URING_CMD_FIXED;
> sqe->user_data = ((uint64_t)offset << 32) | i;
> @@ -328,9 +326,7 @@ static int test_invalid_passthru_submit(const char *file)
> }
>
> sqe = io_uring_get_sqe(&ring);
> - io_uring_prep_read(sqe, fd, vecs[0].iov_base, vecs[0].iov_len, 0);
> - sqe->cmd_op = NVME_URING_CMD_IO;
> - sqe->opcode = IORING_OP_URING_CMD;
> + io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO, fd);
> sqe->user_data = 1;
> cmd = (struct nvme_uring_cmd *)sqe->cmd;
> memset(cmd, 0, sizeof(struct nvme_uring_cmd));
> @@ -401,10 +397,8 @@ static int test_io_uring_submit_enters(const char *file)
> __u32 nlb;
>
> sqe = io_uring_get_sqe(&ring);
> - io_uring_prep_readv(sqe, fd, &vecs[i], 1, offset);
> + io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO, fd);
> sqe->user_data = i;
> - sqe->opcode = IORING_OP_URING_CMD;
> - sqe->cmd_op = NVME_URING_CMD_IO;
> cmd = (struct nvme_uring_cmd *)sqe->cmd;
> memset(cmd, 0, sizeof(struct nvme_uring_cmd));
>
> --
> 2.47.3
>
* Re: [PATCHv5 1/4] liburing: provide uring_cmd prep function
2025-10-19 16:24 ` Caleb Sander Mateos
@ 2025-10-21 16:45 ` Keith Busch
0 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2025-10-21 16:45 UTC (permalink / raw)
To: Caleb Sander Mateos; +Cc: Keith Busch, io-uring, axboe
On Sun, Oct 19, 2025 at 09:24:10AM -0700, Caleb Sander Mateos wrote:
> On Mon, Oct 13, 2025 at 11:00 AM Keith Busch <kbusch@meta.com> wrote:
> > + int fd)
> > + LIBURING_NOEXCEPT
> > +{
> > + sqe->opcode = (__u8) IORING_OP_URING_CMD;
>
> Casting the constant seems unnecessary. Do compilers really warn about this?
Oh, not necessary here, but the next patch wants the cast. This was
copied from io_uring_prep_rw(), which passes the 'opcode' as an 'int'
type. I don't know why that type was used, so just trying to match the
local convention.
But I digress, I'll move the cast to the next patch where it is actually
needed.
> > + io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO, use_fd);
>
> I guess this works because io_uring_prep_uring_cmd() doesn't touch
> sqe->buf_index or sqe->flags, but it seems like it would be less
> brittle to call io_uring_prep_uring_cmd() before setting any of the
> other sqe fields.
Good point, it happens to be "okay", but it's safer to do the generic
init before initializing the command-specific fields.
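In other words, the safer ordering in the tests would look roughly like
this (sketch only; the surrounding fields depend on the individual
test):
	sqe = io_uring_get_sqe(ring);
	/* generic uring_cmd prep first so it cannot clobber fields set below */
	if (nonvec)
		io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO, use_fd);
	else
		io_uring_prep_uring_cmd(sqe, NVME_URING_CMD_IO_VEC, use_fd);
	if (async)
		sqe->flags |= IOSQE_ASYNC;
	if (do_fixed)
		sqe->uring_cmd_flags |= IORING_URING_CMD_FIXED;
	sqe->user_data = ((uint64_t)offset << 32) | i;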
* [PATCHv5 2/4] Add support IORING_SETUP_SQE_MIXED
2025-10-13 18:00 [PATCHv5 0/4] liburing: support for mix sized sqe's Keith Busch
` (2 preceding siblings ...)
2025-10-13 18:00 ` [PATCHv5 1/4] liburing: provide uring_cmd prep function Keith Busch
@ 2025-10-13 18:00 ` Keith Busch
2025-10-13 18:00 ` [PATCHv5 3/4] Add nop testing for IORING_SETUP_SQE_MIXED Keith Busch
2025-10-13 18:00 ` [PATCHv5 4/4] Add mixed sqe test for uring commands Keith Busch
5 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2025-10-13 18:00 UTC (permalink / raw)
To: io-uring, axboe, csander; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
This adds core support for mixed sized SQEs in the same SQ ring. Before
this, SQEs were either 64b in size (the normal size), or 128b if
IORING_SETUP_SQE128 was set in the ring initialization. With the mixed
support, an SQE may be either 64b or 128b on the same SQ ring. If the
SQE is 128b in size, then a 128b opcode will be set in the SQE. When
acquiring a large SQE at the end of the SQ, the client may post a NOP
SQE with IOSQE_CQE_SKIP_SUCCESS set, which the kernel will process
without posting a CQE.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
src/include/liburing.h | 71 +++++++++++++++++++++++++++++++--
src/include/liburing/io_uring.h | 8 ++++
2 files changed, 75 insertions(+), 4 deletions(-)
diff --git a/src/include/liburing.h b/src/include/liburing.h
index f7af20aa..d6a45cbb 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -800,6 +800,12 @@ IOURINGINLINE void io_uring_prep_nop(struct io_uring_sqe *sqe)
io_uring_prep_rw(IORING_OP_NOP, sqe, -1, NULL, 0, 0);
}
+IOURINGINLINE void io_uring_prep_nop128(struct io_uring_sqe *sqe)
+ LIBURING_NOEXCEPT
+{
+ io_uring_prep_rw(IORING_OP_NOP128, sqe, -1, NULL, 0, 0);
+}
+
IOURINGINLINE void io_uring_prep_timeout(struct io_uring_sqe *sqe,
const struct __kernel_timespec *ts,
unsigned count, unsigned flags)
@@ -1517,12 +1523,13 @@ IOURINGINLINE void io_uring_prep_socket_direct_alloc(struct io_uring_sqe *sqe,
__io_uring_set_target_fixed_file(sqe, IORING_FILE_INDEX_ALLOC - 1);
}
-IOURINGINLINE void io_uring_prep_uring_cmd(struct io_uring_sqe *sqe,
- int cmd_op,
- int fd)
+IOURINGINLINE void __io_uring_prep_uring_cmd(struct io_uring_sqe *sqe,
+ int op,
+ int cmd_op,
+ int fd)
LIBURING_NOEXCEPT
{
- sqe->opcode = (__u8) IORING_OP_URING_CMD;
+ sqe->opcode = (__u8) op;
sqe->fd = fd;
sqe->cmd_op = cmd_op;
sqe->__pad1 = 0;
@@ -1530,6 +1537,22 @@ IOURINGINLINE void io_uring_prep_uring_cmd(struct io_uring_sqe *sqe,
sqe->len = 0;
}
+IOURINGINLINE void io_uring_prep_uring_cmd(struct io_uring_sqe *sqe,
+ int cmd_op,
+ int fd)
+ LIBURING_NOEXCEPT
+{
+ __io_uring_prep_uring_cmd(sqe, IORING_OP_URING_CMD, cmd_op, fd);
+}
+
+IOURINGINLINE void io_uring_prep_uring_cmd128(struct io_uring_sqe *sqe,
+ int cmd_op,
+ int fd)
+ LIBURING_NOEXCEPT
+{
+ __io_uring_prep_uring_cmd(sqe, IORING_OP_URING_CMD128, cmd_op, fd);
+}
+
/*
* Prepare commands for sockets
*/
@@ -1894,6 +1917,46 @@ IOURINGINLINE struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring)
return sqe;
}
+/*
+ * Return a 128B sqe to fill. Applications must later call io_uring_submit()
+ * when it's ready to tell the kernel about it. The caller may call this
+ * function multiple times before calling io_uring_submit().
+ *
+ * Returns a vacant 128B sqe, or NULL if we're full. If the current tail is the
+ * last entry in the ring, this function will insert a nop + skip complete such
+ * that the 128b entry wraps back to the beginning of the queue for a
+ * contiguous big sq entry. It's up to the caller to use a 128b opcode in order
+ * for the kernel to know how to advance its sq head pointer.
+ */
+IOURINGINLINE struct io_uring_sqe *io_uring_get_sqe128_mixed(struct io_uring *ring)
+ LIBURING_NOEXCEPT
+{
+ struct io_uring_sq *sq = &ring->sq;
+ unsigned head = io_uring_load_sq_head(ring), tail = sq->sqe_tail;
+ struct io_uring_sqe *sqe;
+
+ if (!(ring->flags & IORING_SETUP_SQE_MIXED))
+ return NULL;
+
+ if (((tail + 1) & sq->ring_mask) == 0) {
+ if ((tail + 2) - head >= sq->ring_entries)
+ return NULL;
+
+ sqe = _io_uring_get_sqe(ring);
+ io_uring_prep_nop(sqe);
+ sqe->flags |= IOSQE_CQE_SKIP_SUCCESS;
+ tail = sq->sqe_tail;
+ } else if ((tail + 1) - head >= sq->ring_entries) {
+ return NULL;
+ }
+
+ sqe = &sq->sqes[tail & sq->ring_mask];
+ sq->sqe_tail = tail + 2;
+ io_uring_initialize_sqe(sqe);
+
+ return sqe;
+}
+
/*
* Return the appropriate mask for a buffer ring of size 'ring_entries'
*/
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 31396057..f2388645 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -211,6 +211,12 @@ enum io_uring_sqe_flags_bit {
*/
#define IORING_SETUP_CQE_MIXED (1U << 18)
+/*
+ * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have
+ * a 128b opcode.
+ */
+#define IORING_SETUP_SQE_MIXED (1U << 19)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -275,6 +281,8 @@ enum io_uring_op {
IORING_OP_READV_FIXED,
IORING_OP_WRITEV_FIXED,
IORING_OP_PIPE,
+ IORING_OP_NOP128,
+ IORING_OP_URING_CMD128,
/* this goes last, obviously */
IORING_OP_LAST,
--
2.47.3
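A minimal usage sketch of the new helpers on a ring created with
IORING_SETUP_SQE_MIXED (error handling omitted; the nop test added in
the next patch exercises the same pattern):
	struct io_uring_sqe *sqe;
	/* normal 64b entry */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	/* 128b entry; the helper inserts a skip-success NOP if it would wrap */
	sqe = io_uring_get_sqe128_mixed(&ring);
	io_uring_prep_nop128(sqe);
	io_uring_submit(&ring);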
* [PATCHv5 3/4] Add nop testing for IORING_SETUP_SQE_MIXED
2025-10-13 18:00 [PATCHv5 0/4] liburing: support for mix sized sqe's Keith Busch
` (3 preceding siblings ...)
2025-10-13 18:00 ` [PATCHv5 2/4] Add support IORING_SETUP_SQE_MIXED Keith Busch
@ 2025-10-13 18:00 ` Keith Busch
2025-10-13 18:00 ` [PATCHv5 4/4] Add mixed sqe test for uring commands Keith Busch
5 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2025-10-13 18:00 UTC (permalink / raw)
To: io-uring, axboe, csander; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
Test mixing 64 and 128 byte sqe entries on a queue.
Insert a bad 128b operation at the end of a mixed SQ to test the
kernel's invalid entry detection.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
test/Makefile | 2 +
test/sqe-mixed-bad-wrap.c | 87 ++++++++++++++++++++++++++++++++++++++
test/sqe-mixed-nop.c | 82 +++++++++++++++++++++++++++++++++++
test/sqe-mixed-uring_cmd.c | 0
4 files changed, 171 insertions(+)
create mode 100644 test/sqe-mixed-bad-wrap.c
create mode 100644 test/sqe-mixed-nop.c
create mode 100644 test/sqe-mixed-uring_cmd.c
diff --git a/test/Makefile b/test/Makefile
index 64d67a1e..2c250c81 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -234,6 +234,8 @@ test_srcs := \
sq-poll-share.c \
sqpoll-sleep.c \
sq-space_left.c \
+ sqe-mixed-nop.c \
+ sqe-mixed-bad-wrap.c \
sqwait.c \
stdout.c \
submit-and-wait.c \
diff --git a/test/sqe-mixed-bad-wrap.c b/test/sqe-mixed-bad-wrap.c
new file mode 100644
index 00000000..d67f06c5
--- /dev/null
+++ b/test/sqe-mixed-bad-wrap.c
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: run various nop tests
+ *
+ */
+#include <stdio.h>
+
+#include "liburing.h"
+#include "helpers.h"
+#include "test.h"
+
+static int seq;
+
+static int test_single_nop(struct io_uring *ring, bool should_fail)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ return T_EXIT_FAIL;
+ }
+
+ if (should_fail)
+ io_uring_prep_nop128(sqe);
+ else
+ io_uring_prep_nop(sqe);
+ sqe->user_data = ++seq;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0)
+ fprintf(stderr, "wait completion %d\n", ret);
+ else if (should_fail && cqe->res == 0)
+ fprintf(stderr, "Unexpected success\n");
+ else if (!should_fail && cqe->res != 0)
+ fprintf(stderr, "Completion error:%d\n", cqe->res);
+ else if (cqe->res == 0 && cqe->user_data != seq)
+ fprintf(stderr, "Unexpected user_data: %ld\n", (long) cqe->user_data);
+ else {
+ io_uring_cqe_seen(ring, cqe);
+ return T_EXIT_PASS;
+ }
+ return T_EXIT_FAIL;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ int ret, i;
+
+ if (argc > 1)
+ return T_EXIT_SKIP;
+
+ ret = io_uring_queue_init(8, &ring, IORING_SETUP_SQE_MIXED);
+ if (ret) {
+ if (ret == -EINVAL)
+ return T_EXIT_SKIP;
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ /* prime the sq to the last entry before wrapping */
+ for (i = 0; i < 7; i++) {
+ ret = test_single_nop(&ring, false);
+ if (ret != T_EXIT_PASS)
+ goto done;
+ }
+
+ /* inserting a 128b sqe in the last entry should fail */
+ ret = test_single_nop(&ring, true);
+ if (ret != T_EXIT_PASS)
+ goto done;
+
+ /* proceeding from the bad wrap should succeed */
+ ret = test_single_nop(&ring, false);
+done:
+ io_uring_queue_exit(&ring);
+ return ret;
+}
diff --git a/test/sqe-mixed-nop.c b/test/sqe-mixed-nop.c
new file mode 100644
index 00000000..88bd6ad2
--- /dev/null
+++ b/test/sqe-mixed-nop.c
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: run various nop tests
+ *
+ */
+#include <stdio.h>
+
+#include "liburing.h"
+#include "helpers.h"
+#include "test.h"
+
+static int seq;
+
+static int test_single_nop(struct io_uring *ring, bool sqe128)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ if (sqe128)
+ sqe = io_uring_get_sqe128_mixed(ring);
+ else
+ sqe = io_uring_get_sqe(ring);
+
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ return T_EXIT_FAIL;
+ }
+
+ if (sqe128)
+ io_uring_prep_nop128(sqe);
+ else
+ io_uring_prep_nop(sqe);
+
+ sqe->user_data = ++seq;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0)
+ fprintf(stderr, "wait completion %d\n", ret);
+ else if (cqe->res != 0)
+ fprintf(stderr, "Completion error:%d\n", cqe->res);
+ else if (cqe->user_data != seq)
+ fprintf(stderr, "Unexpected user_data: %ld\n", (long) cqe->user_data);
+ else {
+ io_uring_cqe_seen(ring, cqe);
+ return T_EXIT_PASS;
+ }
+ return T_EXIT_FAIL;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ int ret, i;
+
+ if (argc > 1)
+ return T_EXIT_SKIP;
+
+ ret = io_uring_queue_init(8, &ring, IORING_SETUP_SQE_MIXED);
+ if (ret) {
+ if (ret == -EINVAL)
+ return T_EXIT_SKIP;
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ /* alternate big and little sqe's */
+ for (i = 0; i < 32; i++) {
+ ret = test_single_nop(&ring, i & 1);
+ if (ret != T_EXIT_PASS)
+ break;
+ }
+
+ io_uring_queue_exit(&ring);
+ return ret;
+}
diff --git a/test/sqe-mixed-uring_cmd.c b/test/sqe-mixed-uring_cmd.c
new file mode 100644
index 00000000..e69de29b
--
2.47.3
* [PATCHv5 4/4] Add mixed sqe test for uring commands
2025-10-13 18:00 [PATCHv5 0/4] liburing: support for mix sized sqe's Keith Busch
` (4 preceding siblings ...)
2025-10-13 18:00 ` [PATCHv5 3/4] Add nop testing for IORING_SETUP_SQE_MIXED Keith Busch
@ 2025-10-13 18:00 ` Keith Busch
5 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2025-10-13 18:00 UTC (permalink / raw)
To: io-uring, axboe, csander; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
test/Makefile | 1 +
test/sqe-mixed-uring_cmd.c | 140 +++++++++++++++++++++++++++++++++++++
2 files changed, 141 insertions(+)
diff --git a/test/Makefile b/test/Makefile
index 2c250c81..2b2e3967 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -236,6 +236,7 @@ test_srcs := \
sq-space_left.c \
sqe-mixed-nop.c \
sqe-mixed-bad-wrap.c \
+ sqe-mixed-uring_cmd.c \
sqwait.c \
stdout.c \
submit-and-wait.c \
diff --git a/test/sqe-mixed-uring_cmd.c b/test/sqe-mixed-uring_cmd.c
index e69de29b..7ac5f4ab 100644
--- a/test/sqe-mixed-uring_cmd.c
+++ b/test/sqe-mixed-uring_cmd.c
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: mixed sqes utilizing basic nop and io_uring passthrough commands
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "helpers.h"
+#include "liburing.h"
+#include "nvme.h"
+
+#define len 0x1000
+static unsigned char buf[len];
+static int seq;
+
+static int test_single_nop(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ return T_EXIT_FAIL;
+ }
+
+ io_uring_prep_nop(sqe);
+ sqe->user_data = ++seq;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0)
+ fprintf(stderr, "wait completion %d\n", ret);
+ else if (cqe->user_data != seq)
+ fprintf(stderr, "Unexpected user_data: %ld\n", (long) cqe->user_data);
+ else {
+ io_uring_cqe_seen(ring, cqe);
+ return T_EXIT_PASS;
+ }
+ return T_EXIT_FAIL;
+}
+
+static int test_single_nvme_read(struct io_uring *ring, int fd)
+{
+ struct nvme_uring_cmd *cmd;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int ret;
+
+ sqe = io_uring_get_sqe128_mixed(ring);
+ if (!sqe) {
+ fprintf(stderr, "get sqe failed\n");
+ return T_EXIT_FAIL;
+ }
+
+ io_uring_prep_uring_cmd128(sqe, NVME_URING_CMD_IO, fd);
+ sqe->user_data = ++seq;
+
+ cmd = (struct nvme_uring_cmd *)sqe->cmd;
+ memset(cmd, 0, sizeof(struct nvme_uring_cmd));
+ cmd->opcode = nvme_cmd_read;
+ cmd->cdw12 = (len >> lba_shift) - 1;
+ cmd->addr = (__u64)(uintptr_t)buf;
+ cmd->data_len = len;
+ cmd->nsid = nsid;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0)
+ fprintf(stderr, "wait completion %d\n", ret);
+ else if (cqe->res != 0)
+ fprintf(stderr, "cqe res %d, wanted 0\n", cqe->res);
+ else if (cqe->user_data != seq)
+ fprintf(stderr, "Unexpected user_data: %ld\n", (long) cqe->user_data);
+ else {
+ io_uring_cqe_seen(ring, cqe);
+ return T_EXIT_PASS;
+ }
+ return T_EXIT_FAIL;
+}
+
+int main(int argc, char *argv[])
+{
+ struct io_uring ring;
+ int fd, ret, i;
+
+ if (argc < 2)
+ return T_EXIT_SKIP;
+
+ ret = nvme_get_info(argv[1]);
+ if (ret)
+ return T_EXIT_SKIP;
+
+ fd = open(argv[1], O_RDONLY);
+ if (fd < 0) {
+ if (errno == EACCES || errno == EPERM)
+ return T_EXIT_SKIP;
+ perror("file open");
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_queue_init(8, &ring,
+ IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED);
+ if (ret) {
+ if (ret == -EINVAL)
+ ret = T_EXIT_SKIP;
+ else {
+ fprintf(stderr, "ring setup failed: %d\n", ret);
+ ret = T_EXIT_FAIL;
+ }
+ goto close;
+ }
+
+ for (i = 0; i < 32; i++) {
+ if (i & 1)
+ ret = test_single_nvme_read(&ring, fd);
+ else
+ ret = test_single_nop(&ring);
+
+ if (ret)
+ break;
+ }
+
+ io_uring_queue_exit(&ring);
+close:
+ close(fd);
+ return ret;
+}
--
2.47.3