From: Pavel Begunkov <asml.silence@gmail.com>
To: io-uring@vger.kernel.org
Cc: asml.silence@gmail.com
Subject: [RFC 10/16] io_uring: separate cqe array from headers
Date: Thu, 6 Nov 2025 17:01:49 +0000 [thread overview]
Message-ID: <274184bd22b625f4420232540ea8801ba4faf98f.1762447538.git.asml.silence@gmail.com> (raw)
In-Reply-To: <cover.1762447538.git.asml.silence@gmail.com>
Keep a pointer to the CQ separate from the SCQ headers; it'll be used
shortly in the following patches. Also, don't overestimate the CQ size
for SETUP_CQE32, which doubles the memory not only for CQ entries but
for the headers as well.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
include/linux/io_uring_types.h | 17 +++++++++--------
io_uring/fdinfo.c | 2 +-
io_uring/io_uring.c | 35 ++++++++++++++++++++++------------
io_uring/io_uring.h | 1 +
io_uring/register.c | 8 +++++++-
5 files changed, 41 insertions(+), 22 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 92780764d5fa..91ded559a147 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -209,14 +209,6 @@ struct io_rings {
* ordered with any other data.
*/
u32 cq_overflow;
- /*
- * Ring buffer of completion events.
- *
- * The kernel writes completion events fresh every time they are
- * produced, so the application is allowed to modify pending
- * entries.
- */
- struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
struct io_restriction {
@@ -274,6 +266,15 @@ struct io_ring_ctx {
struct task_struct *submitter_task;
struct io_rings *rings;
+ /*
+ * Ring buffer of completion events.
+ *
+ * The kernel writes completion events fresh every time they are
+ * produced, so the application is allowed to modify pending
+ * entries.
+ */
+ struct io_uring_cqe *cqes;
+
struct percpu_ref refs;
clockid_t clockid;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ac6e7edc7027..eae13ac9b1a9 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -153,7 +153,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
struct io_uring_cqe *cqe;
bool cqe32 = false;
- cqe = &r->cqes[(cq_head & cq_mask)];
+ cqe = &ctx->cqes[(cq_head & cq_mask)];
if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32)
cqe32 = true;
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index be866a8e94bf..9aef41f6ce23 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -745,7 +745,7 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
{
if (__io_cqring_events(ctx) < ctx->cq_entries) {
- struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+ struct io_uring_cqe *cqe = &ctx->cqes[off];
cqe->user_data = 0;
cqe->res = 0;
@@ -763,7 +763,6 @@ static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
*/
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
{
- struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
unsigned int free, queued, len;
@@ -798,7 +797,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
len <<= 1;
}
- ctx->cqe_cached = &rings->cqes[off];
+ ctx->cqe_cached = &ctx->cqes[off];
ctx->cqe_sentinel = ctx->cqe_cached + len;
return true;
}
@@ -2760,8 +2759,8 @@ static void io_rings_free(struct io_ring_ctx *ctx)
int rings_size(unsigned int flags, unsigned int sq_entries,
unsigned int cq_entries, struct io_scq_dim *dims)
{
- struct io_rings *rings;
size_t off, sq_array_size;
+ size_t cq_size, cqe_size;
size_t sqe_size;
dims->sq_array_offset = SIZE_MAX;
@@ -2769,18 +2768,26 @@ int rings_size(unsigned int flags, unsigned int sq_entries,
sqe_size = sizeof(struct io_uring_sqe);
if (flags & IORING_SETUP_SQE128)
sqe_size *= 2;
+ cqe_size = sizeof(struct io_uring_cqe);
+ if (flags & IORING_SETUP_CQE32)
+ cqe_size *= 2;
dims->sq_size = array_size(sqe_size, sq_entries);
if (dims->sq_size == SIZE_MAX)
return -EOVERFLOW;
- off = struct_size(rings, cqes, cq_entries);
+ off = sizeof(struct io_rings);
+ off = L1_CACHE_ALIGN(off);
+ dims->cq_offset = off;
+
+ cq_size = array_size(cqe_size, cq_entries);
+ if (cq_size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ off = size_add(off, cq_size);
if (off == SIZE_MAX)
return -EOVERFLOW;
- if (flags & IORING_SETUP_CQE32) {
- if (check_shl_overflow(off, 1, &off))
- return -EOVERFLOW;
- }
+
if (flags & IORING_SETUP_CQE_MIXED) {
if (cq_entries < 2)
return -EOVERFLOW;
@@ -3368,6 +3375,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_scq_dim *dims = &config->dims;
struct io_uring_region_desc rd;
struct io_rings *rings;
+ void *ptr;
int ret;
/* make sure these are sane, as we already accounted them */
@@ -3383,9 +3391,12 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
if (ret)
return ret;
- ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
+ ptr = io_region_get_ptr(&ctx->ring_region);
+ ctx->rings = rings = ptr;
+ ctx->cqes = ptr + config->dims.cq_offset;
+
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
- ctx->sq_array = (u32 *)((char *)rings + dims->sq_array_offset);
+ ctx->sq_array = ptr + dims->sq_array_offset;
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(dims->sq_size);
@@ -3504,7 +3515,7 @@ void io_fill_scq_offsets(struct io_uring_params *p, struct io_scq_dim *dims)
p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
- p->cq_off.cqes = offsetof(struct io_rings, cqes);
+ p->cq_off.cqes = dims->cq_offset;
p->cq_off.flags = offsetof(struct io_rings, cq_flags);
p->cq_off.resv1 = 0;
if (!(p->flags & IORING_SETUP_NO_MMAP))
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index f6c4b141a33d..80228c5a843c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -20,6 +20,7 @@
struct io_scq_dim {
size_t sq_array_offset;
size_t sq_size;
+ size_t cq_offset;
/* Compound array mmap'ed together with CQ. */
size_t cq_comp_size;
diff --git a/io_uring/register.c b/io_uring/register.c
index da804f925622..b43a121e2974 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -373,6 +373,7 @@ static int io_register_clock(struct io_ring_ctx *ctx,
struct io_ring_ctx_rings {
struct io_rings *rings;
struct io_uring_sqe *sq_sqes;
+ struct io_uring_cqe *cqes;
struct io_mapped_region sq_region;
struct io_mapped_region ring_region;
@@ -439,6 +440,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
return ret;
n.rings = io_region_get_ptr(&n.ring_region);
+ n.cqes = io_region_get_ptr(&n.ring_region) + dims.cq_offset;
/*
* At this point n.rings is shared with userspace, just like o.rings
@@ -497,6 +499,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
ctx->rings = NULL;
o.sq_sqes = ctx->sq_sqes;
ctx->sq_sqes = NULL;
+ o.cqes = ctx->cqes;
+ ctx->cqes = NULL;
/*
* Now copy SQ and CQ entries, if any. If either of the destination
@@ -522,6 +526,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
/* restore old rings, and return -EOVERFLOW via cleanup path */
ctx->rings = o.rings;
ctx->sq_sqes = o.sq_sqes;
+ ctx->cqes = o.cqes;
to_free = &n;
ret = -EOVERFLOW;
goto out;
@@ -530,7 +535,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
unsigned src_head = i & (ctx->cq_entries - 1);
unsigned dst_head = i & (p.cq_entries - 1);
- n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+ n.cqes[dst_head] = o.cqes[src_head];
}
WRITE_ONCE(n.rings->cq.head, old_head);
WRITE_ONCE(n.rings->cq.tail, tail);
@@ -551,6 +556,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
ctx->rings = n.rings;
ctx->sq_sqes = n.sq_sqes;
+ ctx->cqes = n.cqes;
swap_old(ctx, o, n, ring_region);
swap_old(ctx, o, n, sq_region);
to_free = &o;
--
2.49.0
next prev parent reply other threads:[~2025-11-06 17:02 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-06 17:01 [RFC 00/16] Introduce ring flexible placement Pavel Begunkov
2025-11-06 17:01 ` [RFC 01/16] io_uring: add helper calculating region byte size Pavel Begunkov
2025-11-06 17:01 ` [RFC 02/16] io_uring: pass sq entires in the params struct Pavel Begunkov
2025-11-06 17:01 ` [RFC 03/16] io_uring: use mem_is_zero to check ring params Pavel Begunkov
2025-11-06 17:01 ` [RFC 04/16] io_uring: move flags check to io_uring_sanitise_params Pavel Begunkov
2025-11-06 17:01 ` [RFC 05/16] io_uring: introduce struct io_ctx_config Pavel Begunkov
2025-11-06 17:01 ` [RFC 06/16] io_uring: split out config init helper Pavel Begunkov
2025-11-06 17:01 ` [RFC 07/16] io_uring: add structure keeping ring offsets Pavel Begunkov
2025-11-06 17:01 ` [RFC 08/16] io_uring: pre-calculate scq offsets Pavel Begunkov
2025-11-06 17:01 ` [RFC 09/16] io_uring: inroduce helper for setting user offset Pavel Begunkov
2025-11-06 17:01 ` Pavel Begunkov [this message]
2025-11-06 17:01 ` [RFC 11/16] io_uring/region: introduce io_region_slice Pavel Begunkov
2025-11-06 17:01 ` [RFC 12/16] io_uring: convert pointer init to io_region_slice Pavel Begunkov
2025-11-06 17:01 ` [RFC 13/16] io_uring: refactor rings_size() Pavel Begunkov
2025-11-06 17:01 ` [RFC 14/16] io_uring: extract io_create_mem_region Pavel Begunkov
2025-11-06 17:01 ` [RFC 15/16] io_uring: allow creating mem region at setup Pavel Begunkov
2025-11-06 17:01 ` [RFC 16/16] io_uring: introduce SCQ placement Pavel Begunkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=274184bd22b625f4420232540ea8801ba4faf98f.1762447538.git.asml.silence@gmail.com \
--to=asml.silence@gmail.com \
--cc=io-uring@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox