From: Pavel Begunkov <asml.silence@gmail.com>
To: io-uring@vger.kernel.org
Cc: asml.silence@gmail.com
Subject: [RFC 10/16] io_uring: separate cqe array from headers
Date: Thu,  6 Nov 2025 17:01:49 +0000
Message-ID: <274184bd22b625f4420232540ea8801ba4faf98f.1762447538.git.asml.silence@gmail.com>
In-Reply-To: <cover.1762447538.git.asml.silence@gmail.com>

Keep a pointer to the CQ array separate from the SCQ headers; it'll be
used shortly in the following patches. Also, don't overestimate the
ring size for SETUP_CQE32: the old calculation doubled not only the
memory for CQ entries but the headers as well.
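
A minimal sketch of the resulting layout math (illustrative only,
assuming the mainline L1_CACHE_ALIGN() and uapi struct definitions):

	size_t cqe_size = sizeof(struct io_uring_cqe);	/* 16 bytes */
	size_t cq_offset, ring_bytes;

	if (flags & IORING_SETUP_CQE32)
		cqe_size *= 2;	/* only the CQE size doubles, not the headers */
	/* the CQ array now starts at a cache-aligned offset past the headers */
	cq_offset = L1_CACHE_ALIGN(sizeof(struct io_rings));
	ring_bytes = cq_offset + cqe_size * cq_entries;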

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
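
Note for reviewers, not for the commit message: userspace is expected to
locate the CQ array through the reported cq_off.cqes rather than any
hardcoded struct io_rings layout, so moving the array stays compatible.
A minimal sketch of the usual mmap path, assuming a params struct p
filled in by io_uring_setup():

	size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
	/* map the CQ ring; the CQE array offset comes from the kernel */
	void *cq_ptr = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, ring_fd,
			    IORING_OFF_CQ_RING);
	struct io_uring_cqe *cqes = cq_ptr + p.cq_off.cqes;

Any such userspace keeps working because the offset is still reported
through the same field.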
 include/linux/io_uring_types.h | 17 +++++++++--------
 io_uring/fdinfo.c              |  2 +-
 io_uring/io_uring.c            | 35 ++++++++++++++++++++++------------
 io_uring/io_uring.h            |  1 +
 io_uring/register.c            |  8 +++++++-
 5 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 92780764d5fa..91ded559a147 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -209,14 +209,6 @@ struct io_rings {
 	 * ordered with any other data.
 	 */
 	u32			cq_overflow;
-	/*
-	 * Ring buffer of completion events.
-	 *
-	 * The kernel writes completion events fresh every time they are
-	 * produced, so the application is allowed to modify pending
-	 * entries.
-	 */
-	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
 struct io_restriction {
@@ -274,6 +266,15 @@ struct io_ring_ctx {
 
 		struct task_struct	*submitter_task;
 		struct io_rings		*rings;
+		/*
+		 * Ring buffer of completion events.
+		 *
+		 * The kernel writes completion events fresh every time they are
+		 * produced, so the application is allowed to modify pending
+		 * entries.
+		 */
+		struct io_uring_cqe	*cqes;
+
 		struct percpu_ref	refs;
 
 		clockid_t		clockid;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ac6e7edc7027..eae13ac9b1a9 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -153,7 +153,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 		struct io_uring_cqe *cqe;
 		bool cqe32 = false;
 
-		cqe = &r->cqes[(cq_head & cq_mask)];
+		cqe = &ctx->cqes[(cq_head & cq_mask)];
 		if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32)
 			cqe32 = true;
 		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index be866a8e94bf..9aef41f6ce23 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -745,7 +745,7 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
 {
 	if (__io_cqring_events(ctx) < ctx->cq_entries) {
-		struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+		struct io_uring_cqe *cqe = &ctx->cqes[off];
 
 		cqe->user_data = 0;
 		cqe->res = 0;
@@ -763,7 +763,6 @@ static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
  */
 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 {
-	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
 	unsigned int free, queued, len;
 
@@ -798,7 +797,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 		len <<= 1;
 	}
 
-	ctx->cqe_cached = &rings->cqes[off];
+	ctx->cqe_cached = &ctx->cqes[off];
 	ctx->cqe_sentinel = ctx->cqe_cached + len;
 	return true;
 }
@@ -2760,8 +2759,8 @@ static void io_rings_free(struct io_ring_ctx *ctx)
 int rings_size(unsigned int flags, unsigned int sq_entries,
 	       unsigned int cq_entries, struct io_scq_dim *dims)
 {
-	struct io_rings *rings;
 	size_t off, sq_array_size;
+	size_t cq_size, cqe_size;
 	size_t sqe_size;
 
 	dims->sq_array_offset = SIZE_MAX;
@@ -2769,18 +2768,26 @@ int rings_size(unsigned int flags, unsigned int sq_entries,
 	sqe_size = sizeof(struct io_uring_sqe);
 	if (flags & IORING_SETUP_SQE128)
 		sqe_size *= 2;
+	cqe_size = sizeof(struct io_uring_cqe);
+	if (flags & IORING_SETUP_CQE32)
+		cqe_size *= 2;
 
 	dims->sq_size = array_size(sqe_size, sq_entries);
 	if (dims->sq_size == SIZE_MAX)
 		return -EOVERFLOW;
 
-	off = struct_size(rings, cqes, cq_entries);
+	off = sizeof(struct io_rings);
+	off = L1_CACHE_ALIGN(off);
+	dims->cq_offset = off;
+
+	cq_size = array_size(cqe_size, cq_entries);
+	if (cq_size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	off = size_add(off, cq_size);
 	if (off == SIZE_MAX)
 		return -EOVERFLOW;
-	if (flags & IORING_SETUP_CQE32) {
-		if (check_shl_overflow(off, 1, &off))
-			return -EOVERFLOW;
-	}
+
 	if (flags & IORING_SETUP_CQE_MIXED) {
 		if (cq_entries < 2)
 			return -EOVERFLOW;
@@ -3368,6 +3375,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	struct io_scq_dim *dims = &config->dims;
 	struct io_uring_region_desc rd;
 	struct io_rings *rings;
+	void *ptr;
 	int ret;
 
 	/* make sure these are sane, as we already accounted them */
@@ -3383,9 +3391,12 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
 	if (ret)
 		return ret;
-	ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
+	ptr = io_region_get_ptr(&ctx->ring_region);
+	ctx->rings = rings = ptr;
+	ctx->cqes = ptr + config->dims.cq_offset;
+
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
-		ctx->sq_array = (u32 *)((char *)rings + dims->sq_array_offset);
+		ctx->sq_array = ptr + dims->sq_array_offset;
 
 	memset(&rd, 0, sizeof(rd));
 	rd.size = PAGE_ALIGN(dims->sq_size);
@@ -3504,7 +3515,7 @@ void io_fill_scq_offsets(struct io_uring_params *p, struct io_scq_dim *dims)
 	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
 	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
 	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
-	p->cq_off.cqes = offsetof(struct io_rings, cqes);
+	p->cq_off.cqes = dims->cq_offset;
 	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
 	p->cq_off.resv1 = 0;
 	if (!(p->flags & IORING_SETUP_NO_MMAP))
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index f6c4b141a33d..80228c5a843c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -20,6 +20,7 @@
 struct io_scq_dim {
 	size_t sq_array_offset;
 	size_t sq_size;
+	size_t cq_offset;
 
 	/* Compound array mmap'ed together with CQ. */
 	size_t cq_comp_size;
diff --git a/io_uring/register.c b/io_uring/register.c
index da804f925622..b43a121e2974 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -373,6 +373,7 @@ static int io_register_clock(struct io_ring_ctx *ctx,
 struct io_ring_ctx_rings {
 	struct io_rings *rings;
 	struct io_uring_sqe *sq_sqes;
+	struct io_uring_cqe *cqes;
 
 	struct io_mapped_region sq_region;
 	struct io_mapped_region ring_region;
@@ -439,6 +440,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 		return ret;
 
 	n.rings = io_region_get_ptr(&n.ring_region);
+	n.cqes = io_region_get_ptr(&n.ring_region) + dims.cq_offset;
 
 	/*
 	 * At this point n.rings is shared with userspace, just like o.rings
@@ -497,6 +499,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	ctx->rings = NULL;
 	o.sq_sqes = ctx->sq_sqes;
 	ctx->sq_sqes = NULL;
+	o.cqes = ctx->cqes;
+	ctx->cqes = NULL;
 
 	/*
 	 * Now copy SQ and CQ entries, if any. If either of the destination
@@ -522,6 +526,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 		/* restore old rings, and return -EOVERFLOW via cleanup path */
 		ctx->rings = o.rings;
 		ctx->sq_sqes = o.sq_sqes;
+		ctx->cqes = o.cqes;
 		to_free = &n;
 		ret = -EOVERFLOW;
 		goto out;
@@ -530,7 +535,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 		unsigned src_head = i & (ctx->cq_entries - 1);
 		unsigned dst_head = i & (p.cq_entries - 1);
 
-		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+		n.cqes[dst_head] = o.cqes[src_head];
 	}
 	WRITE_ONCE(n.rings->cq.head, old_head);
 	WRITE_ONCE(n.rings->cq.tail, tail);
@@ -551,6 +556,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 
 	ctx->rings = n.rings;
 	ctx->sq_sqes = n.sq_sqes;
+	ctx->cqes = n.cqes;
 	swap_old(ctx, o, n, ring_region);
 	swap_old(ctx, o, n, sq_region);
 	to_free = &o;
-- 
2.49.0



Thread overview: 17+ messages
2025-11-06 17:01 [RFC 00/16] Introduce ring flexible placement Pavel Begunkov
2025-11-06 17:01 ` [RFC 01/16] io_uring: add helper calculating region byte size Pavel Begunkov
2025-11-06 17:01 ` [RFC 02/16] io_uring: pass sq entries in the params struct Pavel Begunkov
2025-11-06 17:01 ` [RFC 03/16] io_uring: use mem_is_zero to check ring params Pavel Begunkov
2025-11-06 17:01 ` [RFC 04/16] io_uring: move flags check to io_uring_sanitise_params Pavel Begunkov
2025-11-06 17:01 ` [RFC 05/16] io_uring: introduce struct io_ctx_config Pavel Begunkov
2025-11-06 17:01 ` [RFC 06/16] io_uring: split out config init helper Pavel Begunkov
2025-11-06 17:01 ` [RFC 07/16] io_uring: add structure keeping ring offsets Pavel Begunkov
2025-11-06 17:01 ` [RFC 08/16] io_uring: pre-calculate scq offsets Pavel Begunkov
2025-11-06 17:01 ` [RFC 09/16] io_uring: introduce helper for setting user offset Pavel Begunkov
2025-11-06 17:01 ` [RFC 10/16] io_uring: separate cqe array from headers Pavel Begunkov [this message]
2025-11-06 17:01 ` [RFC 11/16] io_uring/region: introduce io_region_slice Pavel Begunkov
2025-11-06 17:01 ` [RFC 12/16] io_uring: convert pointer init to io_region_slice Pavel Begunkov
2025-11-06 17:01 ` [RFC 13/16] io_uring: refactor rings_size() Pavel Begunkov
2025-11-06 17:01 ` [RFC 14/16] io_uring: extract io_create_mem_region Pavel Begunkov
2025-11-06 17:01 ` [RFC 15/16] io_uring: allow creating mem region at setup Pavel Begunkov
2025-11-06 17:01 ` [RFC 16/16] io_uring: introduce SCQ placement Pavel Begunkov
