* [PATCH v4 01/12] io_uring/zcrx: remove sync refill uapi
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
@ 2025-11-03 23:40 ` David Wei
2025-11-04 13:19 ` Pavel Begunkov
2025-11-03 23:41 ` [PATCH v4 02/12] io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL David Wei
` (10 subsequent siblings)
11 siblings, 1 reply; 16+ messages in thread
From: David Wei @ 2025-11-03 23:40 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
From: Pavel Begunkov <asml.silence@gmail.com>
There is a better way to handle the problem IORING_REGISTER_ZCRX_REFILL
solves. Disable it for now and remove relevant uapi, it'll be reworked
for next release.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
include/uapi/linux/io_uring.h | 12 ------------
io_uring/register.c | 3 ---
io_uring/zcrx.c | 11 +++++++++++
3 files changed, 11 insertions(+), 15 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 04797a9b76bc..e96080db3e4d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -697,9 +697,6 @@ enum io_uring_register_op {
/* query various aspects of io_uring, see linux/io_uring/query.h */
IORING_REGISTER_QUERY = 35,
- /* return zcrx buffers back into circulation */
- IORING_REGISTER_ZCRX_REFILL = 36,
-
/* this goes last */
IORING_REGISTER_LAST,
@@ -1081,15 +1078,6 @@ struct io_uring_zcrx_ifq_reg {
__u64 __resv[3];
};
-struct io_uring_zcrx_sync_refill {
- __u32 zcrx_id;
- /* the number of entries to return */
- __u32 nr_entries;
- /* pointer to an array of struct io_uring_zcrx_rqe */
- __u64 rqes;
- __u64 __resv[2];
-};
-
#ifdef __cplusplus
}
#endif
diff --git a/io_uring/register.c b/io_uring/register.c
index 1a3e05be6e7b..d8ce1b5cc3a2 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -826,9 +826,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_REGISTER_QUERY:
ret = io_query(ctx, arg, nr_args);
break;
- case IORING_REGISTER_ZCRX_REFILL:
- ret = io_zcrx_return_bufs(ctx, arg, nr_args);
- break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index a816f5902091..b694fa582d29 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -931,6 +931,16 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
#define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16)
#define IO_ZCRX_SYS_REFILL_BATCH 32
+struct io_uring_zcrx_sync_refill {
+ __u32 zcrx_id;
+ /* the number of entries to return */
+ __u32 nr_entries;
+ /* pointer to an array of struct io_uring_zcrx_rqe */
+ __u64 rqes;
+ __u64 __resv[2];
+};
+
+
static void io_return_buffers(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_rqe *rqes, unsigned nr)
{
@@ -955,6 +965,7 @@ static void io_return_buffers(struct io_zcrx_ifq *ifq,
}
}
+__maybe_unused
int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
void __user *arg, unsigned nr_arg)
{
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH v4 01/12] io_uring/zcrx: remove sync refill uapi
2025-11-03 23:40 ` [PATCH v4 01/12] io_uring/zcrx: remove sync refill uapi David Wei
@ 2025-11-04 13:19 ` Pavel Begunkov
0 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2025-11-04 13:19 UTC (permalink / raw)
To: David Wei, io-uring, netdev; +Cc: Jens Axboe
On 11/3/25 23:40, David Wei wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
>
> There is a better way to handle the problem IORING_REGISTER_ZCRX_REFILL
> solves. Disable it for now and remove relevant uapi, it'll be reworked
> for next release.
You don't need to carry the first two patches. I sent it out properly,
and once it's propagated to for-6.19, this set would need to be
rebased on top.
In the meantime, can you send 3-9 separately? They solve a real
problem, and it'll be easier to merge the rest after as well.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH v4 02/12] io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
2025-11-03 23:40 ` [PATCH v4 01/12] io_uring/zcrx: remove sync refill uapi David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 03/12] io_uring/memmap: remove unneeded io_ring_ctx arg David Wei
` (9 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
From: Pavel Begunkov <asml.silence@gmail.com>
Introduce IORING_REGISTER_ZCRX_CTRL and add some boilerplate code
forwarding it to zcrx. There are no actual users in this patch; it'll be
used for refill queue flushing and other features.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
include/uapi/linux/io_uring.h | 13 +++++++++++++
io_uring/register.c | 3 +++
io_uring/zcrx.c | 21 ++++++++++++++++++++-
io_uring/zcrx.h | 7 +++----
4 files changed, 39 insertions(+), 5 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e96080db3e4d..8b4935b983e7 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -697,6 +697,9 @@ enum io_uring_register_op {
/* query various aspects of io_uring, see linux/io_uring/query.h */
IORING_REGISTER_QUERY = 35,
+ /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
+ IORING_REGISTER_ZCRX_CTRL = 36,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -1078,6 +1081,16 @@ struct io_uring_zcrx_ifq_reg {
__u64 __resv[3];
};
+enum zcrx_ctrl_op {
+ __ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl {
+ __u32 zcrx_id;
+ __u32 op; /* see enum zcrx_ctrl_op */
+ __u64 resv[8];
+};
+
#ifdef __cplusplus
}
#endif
diff --git a/io_uring/register.c b/io_uring/register.c
index d8ce1b5cc3a2..38b20a7a34db 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -826,6 +826,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_REGISTER_QUERY:
ret = io_query(ctx, arg, nr_args);
break;
+ case IORING_REGISTER_ZCRX_CTRL:
+ ret = io_zcrx_ctrl(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index b694fa582d29..3e9d8333a301 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -541,6 +541,25 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
return ifq ? &ifq->region : NULL;
}
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+ struct zcrx_ctrl ctrl;
+ struct io_zcrx_ifq *ifq;
+
+ if (nr_args)
+ return -EINVAL;
+ if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+ return -EFAULT;
+ if (ctrl.op >= __ZCRX_CTRL_LAST)
+ return -EOPNOTSUPP;
+
+ ifq = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+ if (!ifq)
+ return -ENXIO;
+
+ return -EINVAL;
+}
+
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
@@ -966,7 +985,7 @@ static void io_return_buffers(struct io_zcrx_ifq *ifq,
}
__maybe_unused
-int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
+static int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
void __user *arg, unsigned nr_arg)
{
struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH];
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 33ef61503092..d7bef619e8ad 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -63,8 +63,7 @@ struct io_zcrx_ifq {
};
#if defined(CONFIG_IO_URING_ZCRX)
-int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
- void __user *arg, unsigned nr_arg);
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -97,8 +96,8 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct
{
return NULL;
}
-static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
- void __user *arg, unsigned nr_arg)
+static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned nr_arg)
{
return -EOPNOTSUPP;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v4 03/12] io_uring/memmap: remove unneeded io_ring_ctx arg
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
2025-11-03 23:40 ` [PATCH v4 01/12] io_uring/zcrx: remove sync refill uapi David Wei
2025-11-03 23:41 ` [PATCH v4 02/12] io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 04/12] io_uring/memmap: refactor io_free_region() to take user_struct param David Wei
` (8 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
Remove the unused io_ring_ctx arg from io_region_pin_pages() and
io_region_allocate_pages().
Signed-off-by: David Wei <dw@davidwei.uk>
---
io_uring/memmap.c | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index aa388ecd4754..d1318079c337 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -131,9 +131,8 @@ static int io_region_init_ptr(struct io_mapped_region *mr)
return 0;
}
-static int io_region_pin_pages(struct io_ring_ctx *ctx,
- struct io_mapped_region *mr,
- struct io_uring_region_desc *reg)
+static int io_region_pin_pages(struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg)
{
unsigned long size = mr->nr_pages << PAGE_SHIFT;
struct page **pages;
@@ -150,8 +149,7 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx,
return 0;
}
-static int io_region_allocate_pages(struct io_ring_ctx *ctx,
- struct io_mapped_region *mr,
+static int io_region_allocate_pages(struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset)
{
@@ -219,9 +217,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
mr->nr_pages = nr_pages;
if (reg->flags & IORING_MEM_REGION_TYPE_USER)
- ret = io_region_pin_pages(ctx, mr, reg);
+ ret = io_region_pin_pages(mr, reg);
else
- ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset);
+ ret = io_region_allocate_pages(mr, reg, mmap_offset);
if (ret)
goto out_free;
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v4 04/12] io_uring/memmap: refactor io_free_region() to take user_struct param
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (2 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 03/12] io_uring/memmap: remove unneeded io_ring_ctx arg David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 05/12] io_uring/rsrc: refactor io_{un}account_mem() to take {user,mm}_struct param David Wei
` (7 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
Refactor io_free_region() to take user_struct directly, instead of
accessing it from the ring ctx.
Signed-off-by: David Wei <dw@davidwei.uk>
---
io_uring/io_uring.c | 6 +++---
io_uring/kbuf.c | 4 ++--
io_uring/memmap.c | 8 ++++----
io_uring/memmap.h | 2 +-
io_uring/register.c | 6 +++---
io_uring/zcrx.c | 2 +-
6 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 200b6c4bb2cc..7d42748774f8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2798,8 +2798,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
static void io_rings_free(struct io_ring_ctx *ctx)
{
- io_free_region(ctx, &ctx->sq_region);
- io_free_region(ctx, &ctx->ring_region);
+ io_free_region(ctx->user, &ctx->sq_region);
+ io_free_region(ctx->user, &ctx->ring_region);
ctx->rings = NULL;
ctx->sq_sqes = NULL;
}
@@ -2884,7 +2884,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_eventfd_unregister(ctx);
io_free_alloc_caches(ctx);
io_destroy_buffers(ctx);
- io_free_region(ctx, &ctx->param_region);
+ io_free_region(ctx->user, &ctx->param_region);
mutex_unlock(&ctx->uring_lock);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index c034c90396bc..8a329556f8df 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -428,7 +428,7 @@ static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
if (bl->flags & IOBL_BUF_RING)
- io_free_region(ctx, &bl->region);
+ io_free_region(ctx->user, &bl->region);
else
io_remove_buffers_legacy(ctx, bl, -1U);
@@ -672,7 +672,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
fail:
- io_free_region(ctx, &bl->region);
+ io_free_region(ctx->user, &bl->region);
kfree(bl);
return ret;
}
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index d1318079c337..b1054fe94568 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -88,7 +88,7 @@ enum {
IO_REGION_F_SINGLE_REF = 4,
};
-void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
{
if (mr->pages) {
long nr_refs = mr->nr_pages;
@@ -105,8 +105,8 @@ void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
}
if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
vunmap(mr->ptr);
- if (mr->nr_pages && ctx->user)
- __io_unaccount_mem(ctx->user, mr->nr_pages);
+ if (mr->nr_pages && user)
+ __io_unaccount_mem(user, mr->nr_pages);
memset(mr, 0, sizeof(*mr));
}
@@ -228,7 +228,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
goto out_free;
return 0;
out_free:
- io_free_region(ctx, mr);
+ io_free_region(ctx->user, mr);
return ret;
}
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index 58002976e0c3..a7c476f499d5 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -16,7 +16,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long flags);
int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
-void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
+void io_free_region(struct user_struct *user, struct io_mapped_region *mr);
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset);
diff --git a/io_uring/register.c b/io_uring/register.c
index 38b20a7a34db..244d523d1bad 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -381,8 +381,8 @@ struct io_ring_ctx_rings {
static void io_register_free_rings(struct io_ring_ctx *ctx,
struct io_ring_ctx_rings *r)
{
- io_free_region(ctx, &r->sq_region);
- io_free_region(ctx, &r->ring_region);
+ io_free_region(ctx->user, &r->sq_region);
+ io_free_region(ctx->user, &r->ring_region);
}
#define swap_old(ctx, o, n, field) \
@@ -604,7 +604,7 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
if (ret)
return ret;
if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
- io_free_region(ctx, &region);
+ io_free_region(ctx->user, &region);
return -EFAULT;
}
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 3e9d8333a301..ec0a76b4f199 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -378,7 +378,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
- io_free_region(ifq->ctx, &ifq->region);
+ io_free_region(ifq->ctx->user, &ifq->region);
ifq->rq_ring = NULL;
ifq->rqes = NULL;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v4 05/12] io_uring/rsrc: refactor io_{un}account_mem() to take {user,mm}_struct param
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (3 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 04/12] io_uring/memmap: refactor io_free_region() to take user_struct param David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 06/12] io_uring/zcrx: add io_zcrx_ifq arg to io_zcrx_free_area() David Wei
` (6 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
Refactor io_{un}account_mem() to take user_struct and mm_struct
directly, instead of accessing it from the ring ctx.
Signed-off-by: David Wei <dw@davidwei.uk>
---
io_uring/rsrc.c | 26 ++++++++++++++------------
io_uring/rsrc.h | 6 ++++--
io_uring/zcrx.c | 5 +++--
3 files changed, 21 insertions(+), 16 deletions(-)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d787c16dc1c3..59135fe84082 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -56,27 +56,29 @@ int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
return 0;
}
-void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
+ unsigned long nr_pages)
{
- if (ctx->user)
- __io_unaccount_mem(ctx->user, nr_pages);
+ if (user)
+ __io_unaccount_mem(user, nr_pages);
- if (ctx->mm_account)
- atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
+ if (mm_account)
+ atomic64_sub(nr_pages, &mm_account->pinned_vm);
}
-int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
+ unsigned long nr_pages)
{
int ret;
- if (ctx->user) {
- ret = __io_account_mem(ctx->user, nr_pages);
+ if (user) {
+ ret = __io_account_mem(user, nr_pages);
if (ret)
return ret;
}
- if (ctx->mm_account)
- atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
+ if (mm_account)
+ atomic64_add(nr_pages, &mm_account->pinned_vm);
return 0;
}
@@ -145,7 +147,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
}
if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages);
+ io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
imu->release(imu->priv);
io_free_imu(ctx, imu);
}
@@ -684,7 +686,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
if (!imu->acct_pages)
return 0;
- ret = io_account_mem(ctx, imu->acct_pages);
+ ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
if (ret)
imu->acct_pages = 0;
return ret;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index a3ca6ba66596..d603f6a47f5e 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -120,8 +120,10 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int __io_account_mem(struct user_struct *user, unsigned long nr_pages);
-int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages);
-void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages);
+int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
+ unsigned long nr_pages);
+void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
+ unsigned long nr_pages);
static inline void __io_unaccount_mem(struct user_struct *user,
unsigned long nr_pages)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index ec0a76b4f199..ac9abfd54799 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -200,7 +200,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
}
mem->account_pages = io_count_account_pages(pages, nr_pages);
- ret = io_account_mem(ifq->ctx, mem->account_pages);
+ ret = io_account_mem(ifq->ctx->user, ifq->ctx->mm_account, mem->account_pages);
if (ret < 0)
mem->account_pages = 0;
@@ -389,7 +389,8 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
io_release_area_mem(&area->mem);
if (area->mem.account_pages)
- io_unaccount_mem(area->ifq->ctx, area->mem.account_pages);
+ io_unaccount_mem(area->ifq->ctx->user, area->ifq->ctx->mm_account,
+ area->mem.account_pages);
kvfree(area->freelist);
kvfree(area->nia.niovs);
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v4 06/12] io_uring/zcrx: add io_zcrx_ifq arg to io_zcrx_free_area()
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (4 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 05/12] io_uring/rsrc: refactor io_{un}account_mem() to take {user,mm}_struct param David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 07/12] io_uring/zcrx: add user_struct and mm_struct to io_zcrx_ifq David Wei
` (5 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
Add io_zcrx_ifq arg to io_zcrx_free_area(). A QOL change to reduce line
widths.
Signed-off-by: David Wei <dw@davidwei.uk>
---
io_uring/zcrx.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index ac9abfd54799..5dd93e4e0ee7 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -383,9 +383,10 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
ifq->rqes = NULL;
}
-static void io_zcrx_free_area(struct io_zcrx_area *area)
+static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area)
{
- io_zcrx_unmap_area(area->ifq, area);
+ io_zcrx_unmap_area(ifq, area);
io_release_area_mem(&area->mem);
if (area->mem.account_pages)
@@ -464,7 +465,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
return 0;
err:
if (area)
- io_zcrx_free_area(area);
+ io_zcrx_free_area(ifq, area);
return ret;
}
@@ -523,7 +524,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
io_close_queue(ifq);
if (ifq->area)
- io_zcrx_free_area(ifq->area);
+ io_zcrx_free_area(ifq, ifq->area);
if (ifq->dev)
put_device(ifq->dev);
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v4 07/12] io_uring/zcrx: add user_struct and mm_struct to io_zcrx_ifq
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (5 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 06/12] io_uring/zcrx: add io_zcrx_ifq arg to io_zcrx_free_area() David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 08/12] io_uring/zcrx: move io_unregister_zcrx_ifqs() down David Wei
` (4 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
In preparation for removing ifq->ctx and making ifq lifetime independent
of ring ctx, add user_struct and mm_struct to io_zcrx_ifq.
In the ifq cleanup path, these are the only fields used from the main
ring ctx to do accounting. Taking a copy in the ifq allows ifq->ctx to
be removed later, including the ctx->refs held by the ifq.
Signed-off-by: David Wei <dw@davidwei.uk>
---
io_uring/zcrx.c | 24 ++++++++++++++++++------
io_uring/zcrx.h | 2 ++
2 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 5dd93e4e0ee7..dcf5297c0330 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -200,7 +200,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
}
mem->account_pages = io_count_account_pages(pages, nr_pages);
- ret = io_account_mem(ifq->ctx->user, ifq->ctx->mm_account, mem->account_pages);
+ ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
if (ret < 0)
mem->account_pages = 0;
@@ -344,7 +344,8 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
atomic_inc(io_get_user_counter(niov));
}
-static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
+static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
+ struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
struct io_uring_region_desc *rd,
u32 id)
@@ -362,7 +363,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
mmap_offset += id << IORING_OFF_PBUF_SHIFT;
- ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
+ ret = io_create_region(ctx, &ifq->region, rd, mmap_offset);
if (ret < 0)
return ret;
@@ -378,7 +379,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
- io_free_region(ifq->ctx->user, &ifq->region);
+ io_free_region(ifq->user, &ifq->region);
ifq->rq_ring = NULL;
ifq->rqes = NULL;
}
@@ -390,7 +391,7 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
io_release_area_mem(&area->mem);
if (area->mem.account_pages)
- io_unaccount_mem(area->ifq->ctx->user, area->ifq->ctx->mm_account,
+ io_unaccount_mem(ifq->user, ifq->mm_account,
area->mem.account_pages);
kvfree(area->freelist);
@@ -525,6 +526,9 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
if (ifq->area)
io_zcrx_free_area(ifq, ifq->area);
+ free_uid(ifq->user);
+ if (ifq->mm_account)
+ mmdrop(ifq->mm_account);
if (ifq->dev)
put_device(ifq->dev);
@@ -607,6 +611,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ifq = io_zcrx_ifq_alloc(ctx);
if (!ifq)
return -ENOMEM;
+ if (ctx->user) {
+ get_uid(ctx->user);
+ ifq->user = ctx->user;
+ }
+ if (ctx->mm_account) {
+ mmgrab(ctx->mm_account);
+ ifq->mm_account = ctx->mm_account;
+ }
ifq->rq_entries = reg.rq_entries;
scoped_guard(mutex, &ctx->mmap_lock) {
@@ -616,7 +628,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
goto ifq_free;
}
- ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
+ ret = io_allocate_rbuf_ring(ctx, ifq, &reg, &rd, id);
if (ret)
goto err;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index d7bef619e8ad..2396436643e5 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -42,6 +42,8 @@ struct io_zcrx_ifq {
struct io_ring_ctx *ctx;
struct io_zcrx_area *area;
unsigned niov_shift;
+ struct user_struct *user;
+ struct mm_struct *mm_account;
spinlock_t rq_lock ____cacheline_aligned_in_smp;
struct io_uring *rq_ring;
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v4 08/12] io_uring/zcrx: move io_unregister_zcrx_ifqs() down
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (6 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 07/12] io_uring/zcrx: add user_struct and mm_struct to io_zcrx_ifq David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 09/12] io_uring/zcrx: reverse ifq refcount David Wei
` (3 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
In preparation for removing the ref on ctx->refs held by an ifq and
removing io_shutdown_zcrx_ifqs(), move io_unregister_zcrx_ifqs() down
such that it can call io_zcrx_scrub().
Signed-off-by: David Wei <dw@davidwei.uk>
---
io_uring/zcrx.c | 44 ++++++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index dcf5297c0330..bb5cc6ec5b9b 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -681,28 +681,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
return ret;
}
-void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
-{
- struct io_zcrx_ifq *ifq;
-
- lockdep_assert_held(&ctx->uring_lock);
-
- while (1) {
- scoped_guard(mutex, &ctx->mmap_lock) {
- unsigned long id = 0;
-
- ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
- if (ifq)
- xa_erase(&ctx->zcrx_ctxs, id);
- }
- if (!ifq)
- break;
- io_zcrx_ifq_free(ifq);
- }
-
- xa_destroy(&ctx->zcrx_ctxs);
-}
-
static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
unsigned niov_idx;
@@ -768,6 +746,28 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
}
}
+void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+ struct io_zcrx_ifq *ifq;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ while (1) {
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ unsigned long id = 0;
+
+ ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
+ if (ifq)
+ xa_erase(&ctx->zcrx_ctxs, id);
+ }
+ if (!ifq)
+ break;
+ io_zcrx_ifq_free(ifq);
+ }
+
+ xa_destroy(&ctx->zcrx_ctxs);
+}
+
static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
u32 entries;
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH v4 09/12] io_uring/zcrx: reverse ifq refcount
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (7 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 08/12] io_uring/zcrx: move io_unregister_zcrx_ifqs() down David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-04 13:38 ` Pavel Begunkov
2025-11-03 23:41 ` [PATCH v4 10/12] io_uring/zcrx: move io_zcrx_scrub() and dependencies up David Wei
` (2 subsequent siblings)
11 siblings, 1 reply; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
Add two refcounts to struct io_zcrx_ifq to reverse the refcounting
relationship i.e. rings now reference ifqs instead. As a result of this,
remove ctx->refs that an ifq holds on a ring via the page pool memory
provider.
The first ref is ifq->refs, held by internal users of an ifq, namely
rings and the page pool memory provider associated with an ifq. This is
needed to keep the ifq around until the page pool is destroyed.
The second ref is ifq->user_refs, held by userspace facing users like
rings. For now, only the ring that created the ifq will have a ref, but
with ifq sharing added, this will include multiple rings.
ifq->refs will be 1 larger than ifq->user_refs, with the extra ref held
by the page pool. Once ifq->user_refs falls to 0, the ifq is cleaned up
including destroying the page pool. Once the page pool is destroyed,
ifq->refs will fall to 0 and free the ifq.
Since ifqs now no longer hold refs to ring ctx, there isn't a need to
split the cleanup of ifqs into two: io_shutdown_zcrx_ifqs() in
io_ring_exit_work() while waiting for ctx->refs to drop to 0, and
io_unregister_zcrx_ifqs() after. Remove io_shutdown_zcrx_ifqs().
Signed-off-by: David Wei <dw@davidwei.uk>
Co-developed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
io_uring/io_uring.c | 5 -----
io_uring/zcrx.c | 36 +++++++++++++++++-------------------
io_uring/zcrx.h | 8 +++-----
3 files changed, 20 insertions(+), 29 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 7d42748774f8..8af5efda9c11 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3042,11 +3042,6 @@ static __cold void io_ring_exit_work(struct work_struct *work)
io_cqring_overflow_kill(ctx);
mutex_unlock(&ctx->uring_lock);
}
- if (!xa_empty(&ctx->zcrx_ctxs)) {
- mutex_lock(&ctx->uring_lock);
- io_shutdown_zcrx_ifqs(ctx);
- mutex_unlock(&ctx->uring_lock);
- }
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_move_task_work_from_local(ctx);
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index bb5cc6ec5b9b..00498e3dcbd3 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -479,9 +479,10 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
return NULL;
ifq->if_rxq = -1;
- ifq->ctx = ctx;
spin_lock_init(&ifq->rq_lock);
mutex_init(&ifq->pp_lock);
+ refcount_set(&ifq->refs, 1);
+ refcount_set(&ifq->user_refs, 1);
return ifq;
}
@@ -537,6 +538,12 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
kfree(ifq);
}
+static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
+{
+ if (refcount_dec_and_test(&ifq->refs))
+ io_zcrx_ifq_free(ifq);
+}
+
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id)
{
@@ -611,6 +618,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ifq = io_zcrx_ifq_alloc(ctx);
if (!ifq)
return -ENOMEM;
+
if (ctx->user) {
get_uid(ctx->user);
ifq->user = ctx->user;
@@ -733,19 +741,6 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
}
}
-void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
-{
- struct io_zcrx_ifq *ifq;
- unsigned long index;
-
- lockdep_assert_held(&ctx->uring_lock);
-
- xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
- io_zcrx_scrub(ifq);
- io_close_queue(ifq);
- }
-}
-
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
@@ -762,7 +757,12 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
}
if (!ifq)
break;
- io_zcrx_ifq_free(ifq);
+
+ if (refcount_dec_and_test(&ifq->user_refs)) {
+ io_close_queue(ifq);
+ io_zcrx_scrub(ifq);
+ }
+ io_put_zcrx_ifq(ifq);
}
xa_destroy(&ctx->zcrx_ctxs);
@@ -913,15 +913,13 @@ static int io_pp_zc_init(struct page_pool *pp)
if (ret)
return ret;
- percpu_ref_get(&ifq->ctx->refs);
+ refcount_inc(&ifq->refs);
return 0;
}
static void io_pp_zc_destroy(struct page_pool *pp)
{
- struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
-
- percpu_ref_put(&ifq->ctx->refs);
+ io_put_zcrx_ifq(io_pp_to_ifq(pp));
}
static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 2396436643e5..9014a1fd0f61 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -39,7 +39,6 @@ struct io_zcrx_area {
};
struct io_zcrx_ifq {
- struct io_ring_ctx *ctx;
struct io_zcrx_area *area;
unsigned niov_shift;
struct user_struct *user;
@@ -55,6 +54,9 @@ struct io_zcrx_ifq {
struct device *dev;
struct net_device *netdev;
netdevice_tracker netdev_tracker;
+ refcount_t refs;
+ /* counts userspace facing users like io_uring */
+ refcount_t user_refs;
/*
* Page pool and net configuration lock, can be taken deeper in the
@@ -69,7 +71,6 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
-void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct socket *sock, unsigned int flags,
unsigned issue_flags, unsigned int *len);
@@ -84,9 +85,6 @@ static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
}
-static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
-{
-}
static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct socket *sock, unsigned int flags,
unsigned issue_flags, unsigned int *len)
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH v4 09/12] io_uring/zcrx: reverse ifq refcount
2025-11-03 23:41 ` [PATCH v4 09/12] io_uring/zcrx: reverse ifq refcount David Wei
@ 2025-11-04 13:38 ` Pavel Begunkov
0 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2025-11-04 13:38 UTC (permalink / raw)
To: David Wei, io-uring, netdev; +Cc: Jens Axboe
On 11/3/25 23:41, David Wei wrote:
> Add two refcounts to struct io_zcrx_ifq to reverse the refcounting
> relationship i.e. rings now reference ifqs instead. As a result of this,
Note, you don't need the 2nd refcount in this patch as there is
only one io_uring using it. I hope not, but there is a chance
we might need to backport it, which is why it's mildly preferable
to be kept separate.
> remove ctx->refs that an ifq holds on a ring via the page pool memory
> provider.
Nice!
> The first ref is ifq->refs, held by internal users of an ifq, namely
> rings and the page pool memory provider associated with an ifq. This is
> needed to keep the ifq around until the page pool is destroyed.
>
> The second ref is ifq->user_refs, held by userspace facing users like
> rings. For now, only the ring that created the ifq will have a ref, but
> with ifq sharing added, this will include multiple rings.
>
> ifq->refs will be 1 larger than ifq->user_refs, with the extra ref held
Can be larger than +1 as there might be multiple page pools
referring to it.
> by the page pool. Once ifq->user_refs falls to 0, the ifq is cleaned up
> including destroying the page pool. Once the page pool is destroyed,
> ifq->refs will fall to 0 and free the ifq.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH v4 10/12] io_uring/zcrx: move io_zcrx_scrub() and dependencies up
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (8 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 09/12] io_uring/zcrx: reverse ifq refcount David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 11/12] io_uring/zcrx: export zcrx via a file David Wei
2025-11-03 23:41 ` [PATCH v4 12/12] io_uring/zcrx: share an ifq between rings David Wei
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
In preparation for adding zcrx ifq exporting and importing, move
io_zcrx_scrub() and its dependencies up the file to be closer to
io_close_queue().
Signed-off-by: David Wei <dw@davidwei.uk>
---
io_uring/zcrx.c | 84 ++++++++++++++++++++++++-------------------------
1 file changed, 42 insertions(+), 42 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 00498e3dcbd3..e9981478bcf6 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -544,6 +544,48 @@ static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
io_zcrx_ifq_free(ifq);
}
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ spin_lock_bh(&area->freelist_lock);
+ area->freelist[area->free_count++] = net_iov_idx(niov);
+ spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ if (!niov->pp) {
+ /* copy fallback allocated niovs */
+ io_zcrx_return_niov_freelist(niov);
+ return;
+ }
+ page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+ struct io_zcrx_area *area = ifq->area;
+ int i;
+
+ if (!area)
+ return;
+
+ /* Reclaim back all buffers given to the user space. */
+ for (i = 0; i < area->nia.num_niovs; i++) {
+ struct net_iov *niov = &area->nia.niovs[i];
+ int nr;
+
+ if (!atomic_read(io_get_user_counter(niov)))
+ continue;
+ nr = atomic_xchg(io_get_user_counter(niov), 0);
+ if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+ io_zcrx_return_niov(niov);
+ }
+}
+
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id)
{
@@ -699,48 +741,6 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
return &area->nia.niovs[niov_idx];
}
-static void io_zcrx_return_niov_freelist(struct net_iov *niov)
-{
- struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
- spin_lock_bh(&area->freelist_lock);
- area->freelist[area->free_count++] = net_iov_idx(niov);
- spin_unlock_bh(&area->freelist_lock);
-}
-
-static void io_zcrx_return_niov(struct net_iov *niov)
-{
- netmem_ref netmem = net_iov_to_netmem(niov);
-
- if (!niov->pp) {
- /* copy fallback allocated niovs */
- io_zcrx_return_niov_freelist(niov);
- return;
- }
- page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
-}
-
-static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
-{
- struct io_zcrx_area *area = ifq->area;
- int i;
-
- if (!area)
- return;
-
- /* Reclaim back all buffers given to the user space. */
- for (i = 0; i < area->nia.num_niovs; i++) {
- struct net_iov *niov = &area->nia.niovs[i];
- int nr;
-
- if (!atomic_read(io_get_user_counter(niov)))
- continue;
- nr = atomic_xchg(io_get_user_counter(niov), 0);
- if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
- io_zcrx_return_niov(niov);
- }
-}
-
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v4 11/12] io_uring/zcrx: export zcrx via a file
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (9 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 10/12] io_uring/zcrx: move io_zcrx_scrub() and dependencies up David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-03 23:41 ` [PATCH v4 12/12] io_uring/zcrx: share an ifq between rings David Wei
11 siblings, 0 replies; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
From: Pavel Begunkov <asml.silence@gmail.com>
Add an option to wrap a zcrx instance into a file and expose it to the
user space. Currently, users can't do anything meaningful with the file,
but it'll be used in the next patch to import it into another io_uring
instance. It's implemented as a new op called ZCRX_CTRL_EXPORT for the
IORING_REGISTER_ZCRX_CTRL registration opcode.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
include/uapi/linux/io_uring.h | 2 ++
io_uring/zcrx.c | 62 +++++++++++++++++++++++++++++++----
2 files changed, 58 insertions(+), 6 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8b4935b983e7..34bd32402902 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1082,6 +1082,8 @@ struct io_uring_zcrx_ifq_reg {
};
enum zcrx_ctrl_op {
+ ZCRX_CTRL_EXPORT,
+
__ZCRX_CTRL_LAST,
};
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index e9981478bcf6..17ce49536f41 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -8,6 +8,7 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>
+#include <linux/anon_inodes.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
@@ -586,6 +587,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
}
}
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+ if (refcount_dec_and_test(&ifq->user_refs)) {
+ io_close_queue(ifq);
+ io_zcrx_scrub(ifq);
+ }
+ io_put_zcrx_ifq(ifq);
+}
+
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id)
{
@@ -596,6 +606,46 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
return ifq ? &ifq->region : NULL;
}
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+ struct io_zcrx_ifq *ifq = file->private_data;
+
+ zcrx_unregister(ifq);
+ return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+ .owner = THIS_MODULE,
+ .release = zcrx_box_release,
+};
+
+static int export_zcrx(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+ struct zcrx_ctrl *ctrl)
+{
+ struct file *file;
+ int fd = -1;
+
+ if (!mem_is_zero(&ctrl->resv, sizeof(ctrl->resv)))
+ return -EINVAL;
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ refcount_inc(&ifq->refs);
+ refcount_inc(&ifq->user_refs);
+
+ file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+ ifq, O_CLOEXEC, NULL);
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ zcrx_unregister(ifq);
+ return PTR_ERR(file);
+ }
+
+ fd_install(fd, file);
+ return fd;
+}
+
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
struct zcrx_ctrl ctrl;
@@ -612,6 +662,11 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
if (!ifq)
return -ENXIO;
+ switch (ctrl.op) {
+ case ZCRX_CTRL_EXPORT:
+ return export_zcrx(ctx, ifq, &ctrl);
+ }
+
return -EINVAL;
}
@@ -757,12 +812,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
}
if (!ifq)
break;
-
- if (refcount_dec_and_test(&ifq->user_refs)) {
- io_close_queue(ifq);
- io_zcrx_scrub(ifq);
- }
- io_put_zcrx_ifq(ifq);
+ zcrx_unregister(ifq);
}
xa_destroy(&ctx->zcrx_ctxs);
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v4 12/12] io_uring/zcrx: share an ifq between rings
2025-11-03 23:40 [PATCH v4 00/12] io_uring zcrx ifq sharing David Wei
` (10 preceding siblings ...)
2025-11-03 23:41 ` [PATCH v4 11/12] io_uring/zcrx: export zcrx via a file David Wei
@ 2025-11-03 23:41 ` David Wei
2025-11-04 13:53 ` Pavel Begunkov
11 siblings, 1 reply; 16+ messages in thread
From: David Wei @ 2025-11-03 23:41 UTC (permalink / raw)
To: io-uring, netdev; +Cc: Jens Axboe, Pavel Begunkov
Add a way to share an ifq from a src ring that is real (i.e. bound to a
HW RX queue) with other rings. This is done by passing a new flag
IORING_ZCRX_IFQ_REG_IMPORT in the registration struct
io_uring_zcrx_ifq_reg, alongside the fd of an exported zcrx ifq.
Signed-off-by: David Wei <dw@davidwei.uk>
---
include/uapi/linux/io_uring.h | 4 +++
io_uring/zcrx.c | 63 +++++++++++++++++++++++++++++++++--
2 files changed, 65 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 34bd32402902..0ead7f6b2094 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1063,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
__u64 __resv2[2];
};
+enum io_uring_zcrx_ifq_reg_flags {
+ IORING_ZCRX_IFQ_REG_IMPORT = 1,
+};
+
/*
* Argument for IORING_REGISTER_ZCRX_IFQ
*/
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 17ce49536f41..5a0af9dd6a8e 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -625,6 +625,11 @@ static int export_zcrx(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
struct file *file;
int fd = -1;
+ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+ return -EINVAL;
+ if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+ return -EINVAL;
+
if (!mem_is_zero(&ctrl->resv, sizeof(ctrl->resv)))
return -EINVAL;
fd = get_unused_fd_flags(O_CLOEXEC);
@@ -646,6 +651,58 @@ static int export_zcrx(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
return fd;
}
+static int import_zcrx(struct io_ring_ctx *ctx,
+ struct io_uring_zcrx_ifq_reg __user *arg,
+ struct io_uring_zcrx_ifq_reg *reg)
+{
+ struct io_zcrx_ifq *ifq;
+ struct file *file;
+ int fd, ret;
+ u32 id;
+
+ if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+ return -EINVAL;
+
+ fd = reg->if_idx;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ file = fd_file(f);
+ if (file->f_op != &zcrx_box_fops || !file->private_data)
+ return -EBADF;
+
+ ifq = file->private_data;
+ refcount_inc(&ifq->refs);
+ refcount_inc(&ifq->user_refs);
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+ if (ret)
+ goto err;
+ }
+
+ reg->zcrx_id = id;
+ if (copy_to_user(arg, reg, sizeof(*reg))) {
+ ret = -EFAULT;
+ goto err_xa_erase;
+ }
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ ret = -ENOMEM;
+ if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+ goto err_xa_erase;
+ }
+
+ return 0;
+err_xa_erase:
+ scoped_guard(mutex, &ctx->mmap_lock)
+ xa_erase(&ctx->zcrx_ctxs, id);
+err:
+ zcrx_unregister(ifq);
+ return ret;
+}
+
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
struct zcrx_ctrl ctrl;
@@ -695,11 +752,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
return -EINVAL;
if (copy_from_user(®, arg, sizeof(reg)))
return -EFAULT;
- if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
- return -EFAULT;
if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) ||
reg.__resv2 || reg.zcrx_id)
return -EINVAL;
+ if (reg.flags & IORING_ZCRX_IFQ_REG_IMPORT)
+ return import_zcrx(ctx, arg, ®);
+ if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+ return -EFAULT;
if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
return -EINVAL;
if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
--
2.47.3
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH v4 12/12] io_uring/zcrx: share an ifq between rings
2025-11-03 23:41 ` [PATCH v4 12/12] io_uring/zcrx: share an ifq between rings David Wei
@ 2025-11-04 13:53 ` Pavel Begunkov
0 siblings, 0 replies; 16+ messages in thread
From: Pavel Begunkov @ 2025-11-04 13:53 UTC (permalink / raw)
To: David Wei, io-uring, netdev; +Cc: Jens Axboe
On 11/3/25 23:41, David Wei wrote:
> Add a way to share an ifq from a src ring that is real (i.e. bound to a
> HW RX queue) with other rings. This is done by passing a new flag
> IORING_ZCRX_IFQ_REG_IMPORT in the registration struct
> io_uring_zcrx_ifq_reg, alongside the fd of an exported zcrx ifq.
>
> Signed-off-by: David Wei <dw@davidwei.uk>
> ---
> include/uapi/linux/io_uring.h | 4 +++
> io_uring/zcrx.c | 63 +++++++++++++++++++++++++++++++++--
> 2 files changed, 65 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 34bd32402902..0ead7f6b2094 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -1063,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
> __u64 __resv2[2];
> };
>
> +enum io_uring_zcrx_ifq_reg_flags {
Maybe just zcrx_reg_flags? The "io_uring" prefix we used before makes
things too long and quite unwieldy. And "ifq" is dropped, as assuming
a single ifq backing it is not great long term.
> + IORING_ZCRX_IFQ_REG_IMPORT = 1,
Same
> +};
> +
> /*
> * Argument for IORING_REGISTER_ZCRX_IFQ
> */
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index 17ce49536f41..5a0af9dd6a8e 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -625,6 +625,11 @@ static int export_zcrx(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
> struct file *file;
> int fd = -1;
>
> + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
> + return -EINVAL;
> + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
> + return -EINVAL;
This chunk should be in the import path.
> +
> if (!mem_is_zero(&ctrl->resv, sizeof(ctrl->resv)))
> return -EINVAL;
> fd = get_unused_fd_flags(O_CLOEXEC);
> @@ -646,6 +651,58 @@ static int export_zcrx(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
> return fd;
> }
>
> +static int import_zcrx(struct io_ring_ctx *ctx,
> + struct io_uring_zcrx_ifq_reg __user *arg,
> + struct io_uring_zcrx_ifq_reg *reg)
> +{
> + struct io_zcrx_ifq *ifq;
> + struct file *file;
> + int fd, ret;
> + u32 id;
> +
> + if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
> + return -EINVAL;
> +
> + fd = reg->if_idx;
> + CLASS(fd, f)(fd);
> + if (fd_empty(f))
> + return -EBADF;
> +
> + file = fd_file(f);
> + if (file->f_op != &zcrx_box_fops || !file->private_data)
> + return -EBADF;
> +
> + ifq = file->private_data;
> + refcount_inc(&ifq->refs);
> + refcount_inc(&ifq->user_refs);
It'd be a good idea to fill in the basic info about the zcrx that
registration usually returns, e.g. offsets.
> + scoped_guard(mutex, &ctx->mmap_lock) {
> + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
> + if (ret)
> + goto err;
> + }
> +
> + reg->zcrx_id = id;
> + if (copy_to_user(arg, reg, sizeof(*reg))) {
> + ret = -EFAULT;
> + goto err_xa_erase;
> + }
> +
> + scoped_guard(mutex, &ctx->mmap_lock) {
> + ret = -ENOMEM;
> + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
> + goto err_xa_erase;
> + }
> +
> + return 0;
> +err_xa_erase:
> + scoped_guard(mutex, &ctx->mmap_lock)
> + xa_erase(&ctx->zcrx_ctxs, id);
> +err:
> + zcrx_unregister(ifq);
> + return ret;
> +}
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 16+ messages in thread