public inbox for io-uring@vger.kernel.org
 help / color / mirror / Atom feed
From: Pavel Begunkov <asml.silence@gmail.com>
To: io-uring@vger.kernel.org
Cc: asml.silence@gmail.com
Subject: [RFC 16/16] io_uring: introduce SCQ placement
Date: Thu,  6 Nov 2025 17:01:55 +0000	[thread overview]
Message-ID: <e63c0fadab54e7946b0e449c343c3a8fcb2d9358.1762447538.git.asml.silence@gmail.com> (raw)
In-Reply-To: <cover.1762447538.git.asml.silence@gmail.com>

There is a repeated problem with how io_uring manages rings.
Specifically, it creates a new memory region for each ring and places
entries together with headers. As the number of entries is always a
power of 2, it usually means that it needs to allocate an additional
page just for headers, which is wasteful. The headers structure size
is also usually small and under the cache line size; however, it's
padded, which might mean additional cache bouncing.

Introduce a way for the user space to overlap SCQ headers and/or rings
onto a pre-registered memory/parameter region. Each of them has a
separate flag / offset, and io_uring will attempt to place each one at
the specified offset in the region. If the user doesn't request
placement for SQ and/or CQ, io_uring will create a new memory region
for them as before.

The second goal is to be able to put all components into a single region
while knowing what's placed where. It's specifically interesting for
planned BPF work, as it makes program writing much simpler.

Note: zcrx also has the same issue, but it's left out of this series.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/uapi/linux/io_uring.h |  14 ++++
 io_uring/io_uring.c           | 143 ++++++++++++++++++++++++----------
 io_uring/io_uring.h           |  10 ++-
 io_uring/register.c           |   4 +-
 4 files changed, 128 insertions(+), 43 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2da052bd4138..6574f0c6fc57 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -605,8 +605,22 @@ struct io_uring_params {
 	struct io_cqring_offsets cq_off;
 };
 
+enum io_uring_scq_placement_flags {
+	IORING_PLACEMENT_SCQ_HDR		= (1U << 0),
+	IORING_PLACEMENT_SQ			= (1U << 1),
+	IORING_PLACEMENT_CQ			= (1U << 2),
+};
+
+struct io_uring_scq_placement {
+	__u64 flags;
+	__u64 scq_hdr_off;
+	__u64 sq_off;
+	__u64 cq_off;
+};
+
 struct io_uring_params_ext {
 	__u64 mem_region; /* pointer to struct io_uring_mem_region_reg */
+	struct io_uring_scq_placement placement;
 };
 
 /*
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 908c432aaaaa..b5179e444db2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2757,9 +2757,11 @@ static void io_rings_free(struct io_ring_ctx *ctx)
 }
 
 int rings_size(unsigned int flags, unsigned int sq_entries,
-	       unsigned int cq_entries, struct io_scq_dim *dims)
+	       unsigned int cq_entries, struct io_scq_dim *dims,
+	       unsigned placement_flags)
 {
-	size_t cqe_size, off, sqe_size;
+	size_t cqe_size, sqe_size;
+	size_t off = 0;
 
 	if (flags & IORING_SETUP_CQE_MIXED) {
 		if (cq_entries < 2)
@@ -2787,19 +2789,25 @@ int rings_size(unsigned int flags, unsigned int sq_entries,
 	    dims->sq_array_size == SIZE_MAX)
 		return -EOVERFLOW;
 
-	off = sizeof(struct io_rings);
-	off = L1_CACHE_ALIGN(off);
-	dims->cq_offset = off;
+	if (!(placement_flags & IORING_PLACEMENT_SQ))
+		dims->sq_mr_size = dims->sq_size;
 
-	off = size_add(off, dims->cq_size);
-	if (off == SIZE_MAX)
-		return -EOVERFLOW;
+	if (!(placement_flags & IORING_PLACEMENT_SCQ_HDR)) {
+		off = sizeof(struct io_rings);
+		off = L1_CACHE_ALIGN(off);
+	}
+	dims->cq_offset = off;
 
+	if (!(placement_flags & IORING_PLACEMENT_CQ)) {
+		off = size_add(off, dims->cq_size);
+		if (off == SIZE_MAX)
+			return -EOVERFLOW;
 #ifdef CONFIG_SMP
-	off = ALIGN(off, SMP_CACHE_BYTES);
-	if (off == 0)
-		return -EOVERFLOW;
+		off = ALIGN(off, SMP_CACHE_BYTES);
+		if (off == 0)
+			return -EOVERFLOW;
 #endif
+	}
 
 	if (!(flags & IORING_SETUP_NO_SQARRAY)) {
 		dims->sq_array_offset = off;
@@ -2809,7 +2817,7 @@ int rings_size(unsigned int flags, unsigned int sq_entries,
 			return -EOVERFLOW;
 	}
 
-	dims->cq_comp_size = off;
+	dims->rings_mr_size = off;
 	return 0;
 }
 
@@ -3360,12 +3368,47 @@ bool io_is_uring_fops(struct file *file)
 	return file->f_op == &io_uring_fops;
 }
 
+static int io_create_scq_regions(struct io_ring_ctx *ctx,
+				 struct io_ctx_config *config)
+{
+	struct io_scq_dim *dims = &config->dims;
+	struct io_uring_params *p = &config->p;
+	struct io_uring_region_desc rd;
+	int ret;
+
+	if (dims->rings_mr_size) {
+		memset(&rd, 0, sizeof(rd));
+		rd.size = PAGE_ALIGN(dims->rings_mr_size);
+		if (ctx->flags & IORING_SETUP_NO_MMAP) {
+			rd.user_addr = p->cq_off.user_addr;
+			rd.flags |= IORING_MEM_REGION_TYPE_USER;
+		}
+		ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
+		if (ret)
+			return ret;
+	}
+
+	if (dims->sq_mr_size) {
+		memset(&rd, 0, sizeof(rd));
+		rd.size = PAGE_ALIGN(dims->sq_mr_size);
+		if (ctx->flags & IORING_SETUP_NO_MMAP) {
+			rd.user_addr = p->sq_off.user_addr;
+			rd.flags |= IORING_MEM_REGION_TYPE_USER;
+		}
+		ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 					 struct io_ctx_config *config)
 {
+	struct io_uring_scq_placement *pl = &config->ext.placement;
 	struct io_uring_params *p = &config->p;
 	struct io_scq_dim *dims = &config->dims;
-	struct io_uring_region_desc rd;
 	struct io_rings *rings;
 	int ret;
 
@@ -3373,22 +3416,39 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	ctx->sq_entries = p->sq_entries;
 	ctx->cq_entries = p->cq_entries;
 
-	memset(&rd, 0, sizeof(rd));
-	rd.size = PAGE_ALIGN(dims->cq_comp_size);
-	if (ctx->flags & IORING_SETUP_NO_MMAP) {
-		rd.user_addr = p->cq_off.user_addr;
-		rd.flags |= IORING_MEM_REGION_TYPE_USER;
-	}
-	ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
+	ret = io_create_scq_regions(ctx, config);
 	if (ret)
 		return ret;
 
-	ctx->rings = io_region_slice(&ctx->ring_region, 0, sizeof(struct io_rings));
-	ctx->cqes = io_region_slice(&ctx->ring_region, dims->cq_offset, dims->cq_size);
-	if (!ctx->rings || !ctx->cqes)
-		return -EFAULT;
+	if (pl->flags & IORING_PLACEMENT_SQ) {
+		ctx->sq_sqes = io_region_slice(&ctx->param_region,
+						pl->sq_off, dims->sq_size);
+	} else {
+		ctx->sq_sqes = io_region_slice(&ctx->sq_region,
+						0, dims->sq_size);
+	}
+
+	if (pl->flags & IORING_PLACEMENT_SCQ_HDR) {
+		ctx->rings = io_region_slice(&ctx->param_region,
+					     pl->scq_hdr_off,
+					     sizeof(struct io_rings));
+	} else {
+		ctx->rings = io_region_slice(&ctx->ring_region,
+					     0, sizeof(struct io_rings));
+	}
+
+	if (pl->flags & IORING_PLACEMENT_CQ) {
+		ctx->cqes = io_region_slice(&ctx->param_region,
+					    pl->cq_off, dims->cq_size);
+	} else {
+		ctx->cqes = io_region_slice(&ctx->ring_region,
+					    dims->cq_offset, dims->cq_size);
+	}
 
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
+		if (WARN_ON_ONCE(pl->flags & IORING_PLACEMENT_CQ))
+			return -EFAULT;
+
 		ctx->sq_array = io_region_slice(&ctx->ring_region,
 						dims->sq_array_offset,
 						dims->sq_array_size);
@@ -3396,20 +3456,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 			return -EFAULT;
 	}
 
-	memset(&rd, 0, sizeof(rd));
-	rd.size = PAGE_ALIGN(dims->sq_size);
-	if (ctx->flags & IORING_SETUP_NO_MMAP) {
-		rd.user_addr = p->sq_off.user_addr;
-		rd.flags |= IORING_MEM_REGION_TYPE_USER;
-	}
-	ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
-	if (ret) {
-		io_rings_free(ctx);
-		return ret;
-	}
-
-	ctx->sq_sqes = io_region_slice(&ctx->sq_region, 0, dims->sq_size);
-	if (!ctx->sq_sqes)
+	if (!ctx->sq_sqes || !ctx->cqes || !ctx->rings)
 		return -EFAULT;
 
 	rings = ctx->rings;
@@ -3575,6 +3622,8 @@ static int io_prepare_config(struct io_ctx_config *config)
 {
 	struct io_uring_params *p = &config->p;
 	struct io_uring_params_ext __user *ext_user;
+	struct io_uring_params_ext *e = &config->ext;
+	struct io_uring_scq_placement *pl = &e->placement;
 	int ret;
 
 	ext_user = u64_to_user_ptr(config->p.params_ext);
@@ -3589,10 +3638,26 @@ static int io_prepare_config(struct io_ctx_config *config)
 	if (ret)
 		return ret;
 
-	ret = rings_size(p->flags, p->sq_entries, p->cq_entries, &config->dims);
+	ret = rings_size(p->flags, p->sq_entries, p->cq_entries, &config->dims,
+			 pl->flags);
 	if (ret)
 		return ret;
 
+	if (pl->flags) {
+		if (pl->flags & ~IORING_PLACEMENT_MASK)
+			return -EOPNOTSUPP;
+		/* requires a registered memory region */
+		if (!e->mem_region)
+			return -EINVAL;
+		/* SQ arrays are not supported for simplicity */
+		if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+			return -EINVAL;
+		/* don't allow creating a new region with just for headers */
+		if ((pl->flags & IORING_PLACEMENT_CQ) &&
+		     !(pl->flags & IORING_PLACEMENT_SCQ_HDR))
+			return -EINVAL;
+	}
+
 	io_fill_scq_offsets(p, &config->dims);
 	return 0;
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index c883017b11d3..307710464cc4 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -25,7 +25,8 @@ struct io_scq_dim {
 	size_t cq_size;
 
 	/* Compound array mmap'ed together with CQ. */
-	size_t cq_comp_size;
+	size_t rings_mr_size;
+	size_t sq_mr_size;
 };
 
 struct io_ctx_config {
@@ -35,6 +36,10 @@ struct io_ctx_config {
 	struct io_uring_params __user *uptr;
 };
 
+#define IORING_PLACEMENT_MASK (IORING_PLACEMENT_SCQ_HDR |\
+				IORING_PLACEMENT_SQ |\
+				IORING_PLACEMENT_CQ)
+
 #define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\
 			IORING_FEAT_NODROP |\
 			IORING_FEAT_SUBMIT_STABLE |\
@@ -153,7 +158,8 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
 #define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
 
 int rings_size(unsigned int flags, unsigned int sq_entries,
-	       unsigned int cq_entries, struct io_scq_dim *dims);
+	       unsigned int cq_entries, struct io_scq_dim *dims,
+	       unsigned placement_flags);
 int io_uring_fill_params(struct io_uring_params *p);
 void io_fill_scq_offsets(struct io_uring_params *p, struct io_scq_dim *dims);
 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
diff --git a/io_uring/register.c b/io_uring/register.c
index 4affabc416aa..bbcb5a79a35f 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -423,12 +423,12 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	ret = io_uring_fill_params(&p);
 	if (unlikely(ret))
 		return ret;
-	ret = rings_size(p.flags, p.sq_entries, p.cq_entries, &dims);
+	ret = rings_size(p.flags, p.sq_entries, p.cq_entries, &dims, 0);
 	if (ret)
 		return ret;
 	io_fill_scq_offsets(&p, &dims);
 
-	size = dims.cq_comp_size;
+	size = dims.rings_mr_size;
 	sq_array_offset = dims.sq_array_offset;
 
 	memset(&rd, 0, sizeof(rd));
-- 
2.49.0


      parent reply	other threads:[~2025-11-06 17:02 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-06 17:01 [RFC 00/16] Introduce ring flexible placement Pavel Begunkov
2025-11-06 17:01 ` [RFC 01/16] io_uring: add helper calculating region byte size Pavel Begunkov
2025-11-06 17:01 ` [RFC 02/16] io_uring: pass sq entires in the params struct Pavel Begunkov
2025-11-06 17:01 ` [RFC 03/16] io_uring: use mem_is_zero to check ring params Pavel Begunkov
2025-11-06 17:01 ` [RFC 04/16] io_uring: move flags check to io_uring_sanitise_params Pavel Begunkov
2025-11-06 17:01 ` [RFC 05/16] io_uring: introduce struct io_ctx_config Pavel Begunkov
2025-11-06 17:01 ` [RFC 06/16] io_uring: split out config init helper Pavel Begunkov
2025-11-06 17:01 ` [RFC 07/16] io_uring: add structure keeping ring offsets Pavel Begunkov
2025-11-06 17:01 ` [RFC 08/16] io_uring: pre-calculate scq offsets Pavel Begunkov
2025-11-06 17:01 ` [RFC 09/16] io_uring: inroduce helper for setting user offset Pavel Begunkov
2025-11-06 17:01 ` [RFC 10/16] io_uring: separate cqe array from headers Pavel Begunkov
2025-11-06 17:01 ` [RFC 11/16] io_uring/region: introduce io_region_slice Pavel Begunkov
2025-11-06 17:01 ` [RFC 12/16] io_uring: convert pointer init to io_region_slice Pavel Begunkov
2025-11-06 17:01 ` [RFC 13/16] io_uring: refactor rings_size() Pavel Begunkov
2025-11-06 17:01 ` [RFC 14/16] io_uring: extract io_create_mem_region Pavel Begunkov
2025-11-06 17:01 ` [RFC 15/16] io_uring: allow creating mem region at setup Pavel Begunkov
2025-11-06 17:01 ` Pavel Begunkov [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e63c0fadab54e7946b0e449c343c3a8fcb2d9358.1762447538.git.asml.silence@gmail.com \
    --to=asml.silence@gmail.com \
    --cc=io-uring@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox