* [PATCH v3 1/8] io_uring/kbuf: add support for kernel-managed buffer rings
2026-03-06 0:32 [PATCH v3 0/8] io_uring: add kernel-managed buffer rings Joanne Koong
@ 2026-03-06 0:32 ` Joanne Koong
2026-03-06 0:32 ` [PATCH v3 2/8] io_uring/kbuf: support kernel-managed buffer rings in buffer selection Joanne Koong
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joanne Koong @ 2026-03-06 0:32 UTC (permalink / raw)
To: axboe; +Cc: hch, asml.silence, bernd, csander, krisman, linux-fsdevel,
io-uring
Add support for kernel-managed buffer rings (kmbuf rings), which allow
the kernel to allocate and manage the backing buffers for a buffer
ring, rather than requiring the application to provide and manage them.
Internally, the IOBL_KERNEL_MANAGED flag marks buffer lists as
kernel-managed for appropriate handling in the I/O path.
At the uapi level, kernel-managed buffer rings are created through the
pbuf interface with the IOU_PBUF_RING_KERNEL_MANAGED flag set. The
io_uring_buf_reg struct is modified to allow taking in a buf_size
instead of a ring_addr. To create a kernel-managed buffer ring, the
caller must set the IOU_PBUF_RING_MMAP flag as well to indicate that the
kernel will allocate the memory for the ring. When the caller mmaps
the ring, they will get back a virtual mapping to the buffer memory.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/uapi/linux/io_uring.h | 16 ++++-
io_uring/kbuf.c | 97 ++++++++++++++++++++++++-----
io_uring/kbuf.h | 6 +-
io_uring/memmap.c | 111 ++++++++++++++++++++++++++++++++++
io_uring/memmap.h | 4 ++
5 files changed, 215 insertions(+), 19 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 23eaeb1fc8d9..81dddf0ba0eb 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -892,15 +892,29 @@ struct io_uring_buf_ring {
* use of it will consume only as much as it needs. This
* requires that both the kernel and application keep
* track of where the current read/recv index is at.
+ * IOU_PBUF_RING_KERNEL_MANAGED: If set, kernel allocates and manages the memory
+ * for the ring and its buffers. The application must set
+ * the buffer size through reg->buf_size and the size must
+ * be page-aligned. When the application subsequently calls
+ * mmap(2) with
+ * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT),
+ * the virtual mapping returned is a contiguous mapping of
+ * the buffers. If set, IOU_PBUF_RING_MMAP must be set as
+ * well.
*/
enum io_uring_register_pbuf_ring_flags {
IOU_PBUF_RING_MMAP = 1,
IOU_PBUF_RING_INC = 2,
+ IOU_PBUF_RING_KERNEL_MANAGED = 4,
};
/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
- __u64 ring_addr;
+ union {
+ __u64 ring_addr;
+ /* used if reg->flags & IOU_PBUF_RING_KERNEL_MANAGED */
+ __u32 buf_size;
+ };
__u32 ring_entries;
__u16 bgid;
__u16 flags;
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 2ffa95b1c601..0e42c8f602e1 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -427,10 +427,13 @@ static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
- if (bl->flags & IOBL_BUF_RING)
+ if (bl->flags & IOBL_BUF_RING) {
io_free_region(ctx->user, &bl->region);
- else
+ if (bl->flags & IOBL_KERNEL_MANAGED)
+ kfree(bl->buf_ring);
+ } else {
io_remove_buffers_legacy(ctx, bl, -1U);
+ }
kfree(bl);
}
@@ -596,14 +599,53 @@ int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
return IOU_COMPLETE;
}
+static int io_setup_kmbuf_ring(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl,
+ const struct io_uring_buf_reg *reg)
+{
+ struct io_uring_buf_ring *ring;
+ unsigned long ring_size;
+ void *buf_region;
+ unsigned int i;
+ int ret;
+
+ /* allocate pages for the ring structure */
+ ring_size = flex_array_size(ring, bufs, reg->ring_entries);
+ ring = kzalloc(ring_size, GFP_KERNEL_ACCOUNT);
+ if (!ring)
+ return -ENOMEM;
+
+ ret = io_create_region_multi_buf(ctx, &bl->region, reg->ring_entries,
+ reg->buf_size);
+ if (ret) {
+ kfree(ring);
+ return ret;
+ }
+
+ /* initialize ring buf entries to point to the buffers */
+ buf_region = bl->region.ptr;
+ for (i = 0; i < reg->ring_entries; i++) {
+ struct io_uring_buf *buf = &ring->bufs[i];
+
+ buf->addr = (u64)(uintptr_t)buf_region;
+ buf->len = reg->buf_size;
+ buf->bid = i;
+
+ buf_region += reg->buf_size;
+ }
+ ring->tail = reg->ring_entries;
+
+ bl->buf_ring = ring;
+ bl->flags |= IOBL_KERNEL_MANAGED;
+
+ return 0;
+}
+
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;
- struct io_uring_region_desc rd;
struct io_uring_buf_ring *br;
- unsigned long mmap_offset;
- unsigned long ring_size;
int ret;
lockdep_assert_held(&ctx->uring_lock);
@@ -612,7 +654,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -EFAULT;
if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
return -EINVAL;
- if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
+ if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC |
+ IOU_PBUF_RING_KERNEL_MANAGED))
return -EINVAL;
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
@@ -620,6 +663,16 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.ring_entries >= 65536)
return -EINVAL;
+ if (reg.flags & IOU_PBUF_RING_KERNEL_MANAGED) {
+ if (!(reg.flags & IOU_PBUF_RING_MMAP))
+ return -EINVAL;
+ /* not yet supported */
+ if (reg.flags & IOU_PBUF_RING_INC)
+ return -EINVAL;
+ if (!reg.buf_size || !PAGE_ALIGNED(reg.buf_size))
+ return -EINVAL;
+ }
+
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
@@ -632,19 +685,30 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!bl)
return -ENOMEM;
- mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
- ring_size = flex_array_size(br, bufs, reg.ring_entries);
+ if (!(reg.flags & IOU_PBUF_RING_KERNEL_MANAGED)) {
+ struct io_uring_region_desc rd;
+ unsigned long mmap_offset;
+ unsigned long ring_size;
+
+ mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
+ ring_size = flex_array_size(br, bufs, reg.ring_entries);
- memset(&rd, 0, sizeof(rd));
- rd.size = PAGE_ALIGN(ring_size);
- if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
- rd.user_addr = reg.ring_addr;
- rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(ring_size);
+ if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+ rd.user_addr = reg.ring_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region(ctx, &bl->region, &rd, mmap_offset);
+ if (!ret)
+ bl->buf_ring = io_region_get_ptr(&bl->region);
+ } else {
+ ret = io_setup_kmbuf_ring(ctx, bl, &reg);
}
- ret = io_create_region(ctx, &bl->region, &rd, mmap_offset);
if (ret)
goto fail;
- br = io_region_get_ptr(&bl->region);
+
+ br = bl->buf_ring;
#ifdef SHM_COLOUR
/*
@@ -666,7 +730,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
bl->flags |= IOBL_BUF_RING;
- bl->buf_ring = br;
if (reg.flags & IOU_PBUF_RING_INC)
bl->flags |= IOBL_INC;
ret = io_buffer_add_list(ctx, bl, reg.bgid);
@@ -674,6 +737,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
fail:
io_free_region(ctx->user, &bl->region);
+ if (bl->flags & IOBL_KERNEL_MANAGED)
+ kfree(bl->buf_ring);
kfree(bl);
return ret;
}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index bf15e26520d3..38dd5fe6716e 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -7,9 +7,11 @@
enum {
/* ring mapped provided buffers */
- IOBL_BUF_RING = 1,
+ IOBL_BUF_RING = 1,
/* buffers are consumed incrementally rather than always fully */
- IOBL_INC = 2,
+ IOBL_INC = 2,
+ /* buffers are kernel managed */
+ IOBL_KERNEL_MANAGED = 4,
};
struct io_buffer_list {
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index e6958968975a..4979cbbfa27c 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -15,6 +15,28 @@
#include "rsrc.h"
#include "zcrx.h"
+static void release_multi_buf_pages(struct page **pages, unsigned long nr_pages)
+{
+ struct page *page;
+ unsigned int nr, i = 0;
+
+ while (nr_pages) {
+ page = pages[i];
+
+ if (!page || WARN_ON_ONCE(page != compound_head(page)))
+ return;
+
+ nr = compound_nr(page);
+ put_page(page);
+
+ if (WARN_ON_ONCE(nr > nr_pages))
+ return;
+
+ i += nr;
+ nr_pages -= nr;
+ }
+}
+
static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
size_t size, gfp_t gfp)
{
@@ -86,6 +108,8 @@ enum {
IO_REGION_F_USER_PROVIDED = 2,
/* only the first page in the array is ref'ed */
IO_REGION_F_SINGLE_REF = 4,
+ /* pages in the array belong to multiple discrete allocations */
+ IO_REGION_F_MULTI_BUF = 8,
};
void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
@@ -98,6 +122,8 @@ void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
if (mr->flags & IO_REGION_F_USER_PROVIDED)
unpin_user_pages(mr->pages, nr_refs);
+ else if (mr->flags & IO_REGION_F_MULTI_BUF)
+ release_multi_buf_pages(mr->pages, nr_refs);
else
release_pages(mr->pages, nr_refs);
@@ -149,6 +175,54 @@ static int io_region_pin_pages(struct io_mapped_region *mr,
return 0;
}
+static int io_region_allocate_pages_multi_buf(struct io_mapped_region *mr,
+ unsigned int nr_bufs,
+ unsigned int buf_size)
+{
+ gfp_t gfp = GFP_USER | __GFP_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+ struct page **pages, **cur_pages;
+ unsigned int nr_allocated;
+ unsigned int buf_pages;
+ unsigned int i;
+
+ if (!PAGE_ALIGNED(buf_size))
+ return -EINVAL;
+
+ buf_pages = buf_size >> PAGE_SHIFT;
+
+ pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
+ if (!pages)
+ return -ENOMEM;
+
+ cur_pages = pages;
+
+ for (i = 0; i < nr_bufs; i++) {
+ if (io_mem_alloc_compound(cur_pages, buf_pages, buf_size,
+ gfp)) {
+ cur_pages += buf_pages;
+ continue;
+ }
+
+ nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE,
+ buf_pages, cur_pages);
+ if (nr_allocated != buf_pages) {
+ unsigned int total =
+ (cur_pages - pages) + nr_allocated;
+
+ release_multi_buf_pages(pages, total);
+ kvfree(pages);
+ return -ENOMEM;
+ }
+
+ cur_pages += buf_pages;
+ }
+
+ mr->flags |= IO_REGION_F_MULTI_BUF;
+ mr->pages = pages;
+
+ return 0;
+}
+
static int io_region_allocate_pages(struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset)
@@ -181,6 +255,43 @@ static int io_region_allocate_pages(struct io_mapped_region *mr,
return 0;
}
+int io_create_region_multi_buf(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr,
+ unsigned int nr_bufs, unsigned int buf_size)
+{
+ unsigned int nr_pages;
+ int ret;
+
+ if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
+ return -EFAULT;
+
+ if (WARN_ON_ONCE(!nr_bufs || !buf_size || !PAGE_ALIGNED(buf_size)))
+ return -EINVAL;
+
+ if (check_mul_overflow(buf_size >> PAGE_SHIFT, nr_bufs, &nr_pages))
+ return -EINVAL;
+
+ if (ctx->user) {
+ ret = __io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ return ret;
+ }
+ mr->nr_pages = nr_pages;
+
+ ret = io_region_allocate_pages_multi_buf(mr, nr_bufs, buf_size);
+ if (ret)
+ goto out_free;
+
+ ret = io_region_init_ptr(mr);
+ if (ret)
+ goto out_free;
+
+ return 0;
+out_free:
+ io_free_region(ctx->user, mr);
+ return ret;
+}
+
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset)
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index f4cfbb6b9a1f..3aa1167462ae 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -22,6 +22,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset);
+int io_create_region_multi_buf(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr,
+ unsigned int nr_bufs, unsigned int buf_size);
+
static inline void *io_region_get_ptr(struct io_mapped_region *mr)
{
return mr->ptr;
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH v3 2/8] io_uring/kbuf: support kernel-managed buffer rings in buffer selection
2026-03-06 0:32 [PATCH v3 0/8] io_uring: add kernel-managed buffer rings Joanne Koong
2026-03-06 0:32 ` [PATCH v3 1/8] io_uring/kbuf: add support for " Joanne Koong
@ 2026-03-06 0:32 ` Joanne Koong
2026-03-06 0:32 ` [PATCH v3 3/8] io_uring/kbuf: add buffer ring pinning/unpinning Joanne Koong
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joanne Koong @ 2026-03-06 0:32 UTC (permalink / raw)
To: axboe; +Cc: hch, asml.silence, bernd, csander, krisman, linux-fsdevel,
io-uring
Allow kernel-managed buffers to be selected. This requires modifying the
io_br_sel struct to separate the fields for address and val, since a
kernel address cannot be distinguished from a negative val when error
checking.
Auto-commit any selected kernel-managed buffer.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring_types.h | 8 ++++----
io_uring/kbuf.c | 16 ++++++++++++----
2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3e4a82a6f817..36cc2e0346d9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -93,13 +93,13 @@ struct io_mapped_region {
*/
struct io_br_sel {
struct io_buffer_list *buf_list;
- /*
- * Some selection parts return the user address, others return an error.
- */
union {
+ /* for classic/ring provided buffers */
void __user *addr;
- ssize_t val;
+ /* for kernel-managed buffers */
+ void *kaddr;
};
+ ssize_t val;
};
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 0e42c8f602e1..13b80c667881 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -155,7 +155,8 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
return 1;
}
-static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags)
+static bool io_should_commit(struct io_kiocb *req, struct io_buffer_list *bl,
+ unsigned int issue_flags)
{
/*
* If we came in unlocked, we have no choice but to consume the
@@ -170,7 +171,11 @@ static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags)
if (issue_flags & IO_URING_F_UNLOCKED)
return true;
- /* uring_cmd commits kbuf upfront, no need to auto-commit */
+ /* kernel-managed buffers are auto-committed */
+ if (bl->flags & IOBL_KERNEL_MANAGED)
+ return true;
+
+ /* multishot uring_cmd commits kbuf upfront, no need to auto-commit */
if (!io_file_can_poll(req) && !io_is_uring_cmd(req))
return true;
return false;
@@ -200,9 +205,12 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
req->buf_index = READ_ONCE(buf->bid);
sel.buf_list = bl;
- sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
+ if (bl->flags & IOBL_KERNEL_MANAGED)
+ sel.kaddr = (void *)(uintptr_t)READ_ONCE(buf->addr);
+ else
+ sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
- if (io_should_commit(req, issue_flags)) {
+ if (io_should_commit(req, bl, issue_flags)) {
io_kbuf_commit(req, sel.buf_list, *len, 1);
sel.buf_list = NULL;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH v3 3/8] io_uring/kbuf: add buffer ring pinning/unpinning
2026-03-06 0:32 [PATCH v3 0/8] io_uring: add kernel-managed buffer rings Joanne Koong
2026-03-06 0:32 ` [PATCH v3 1/8] io_uring/kbuf: add support for " Joanne Koong
2026-03-06 0:32 ` [PATCH v3 2/8] io_uring/kbuf: support kernel-managed buffer rings in buffer selection Joanne Koong
@ 2026-03-06 0:32 ` Joanne Koong
2026-03-06 0:32 ` [PATCH v3 4/8] io_uring/kbuf: return buffer id in buffer selection Joanne Koong
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joanne Koong @ 2026-03-06 0:32 UTC (permalink / raw)
To: axboe; +Cc: hch, asml.silence, bernd, csander, krisman, linux-fsdevel,
io-uring
Add kernel APIs to pin and unpin buffer rings, preventing userspace from
unregistering a buffer ring while it is pinned by the kernel.
This provides a mechanism for kernel subsystems to safely access buffer
ring contents while ensuring the buffer ring remains valid. A pinned
buffer ring cannot be unregistered until explicitly unpinned. On the
userspace side, trying to unregister a pinned buffer will return -EBUSY.
This is a preparatory change for upcoming fuse usage of kernel-managed
buffer rings. It is necessary for fuse to pin the buffer ring because
fuse may need to select a buffer in atomic contexts, which it can only
do by using the underlying buffer list pointer.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 17 +++++++++++
io_uring/kbuf.c | 55 ++++++++++++++++++++++++++++++++++++
io_uring/kbuf.h | 5 ++++
3 files changed, 77 insertions(+)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 331dcbefe72f..7ce36e143285 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -91,6 +91,10 @@ struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
struct io_br_sel *sel, unsigned int issue_flags);
+int io_uring_buf_ring_pin(struct io_uring_cmd *cmd, unsigned buf_group,
+ unsigned issue_flags, struct io_buffer_list **out_bl);
+int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, unsigned buf_group,
+ unsigned issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -133,6 +137,19 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
{
return true;
}
+static inline int io_uring_buf_ring_pin(struct io_uring_cmd *cmd,
+ unsigned buf_group,
+ unsigned issue_flags,
+ struct io_buffer_list **bl)
+{
+ return -EOPNOTSUPP;
+}
+static inline int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd,
+ unsigned buf_group,
+ unsigned issue_flags)
+{
+ return -EOPNOTSUPP;
+}
#endif
static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 13b80c667881..cb2d3bbdca67 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -9,6 +9,7 @@
#include <linux/poll.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
#include <uapi/linux/io_uring.h>
@@ -237,6 +238,58 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
return sel;
}
+int io_uring_buf_ring_pin(struct io_uring_cmd *cmd, unsigned buf_group,
+ unsigned issue_flags, struct io_buffer_list **out_bl)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct io_buffer_list *bl;
+ int ret = -EINVAL;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, buf_group);
+ if (!bl || !(bl->flags & IOBL_BUF_RING))
+ goto err;
+
+ if (unlikely(bl->flags & IOBL_PINNED)) {
+ ret = -EALREADY;
+ goto err;
+ }
+
+ bl->flags |= IOBL_PINNED;
+ ret = 0;
+ *out_bl = bl;
+err:
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(io_uring_buf_ring_pin);
+
+int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, unsigned buf_group,
+ unsigned issue_flags)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct io_buffer_list *bl;
+ unsigned int required_flags;
+ int ret = -EINVAL;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, buf_group);
+ if (!bl)
+ goto err;
+
+ required_flags = IOBL_BUF_RING | IOBL_PINNED;
+ if ((bl->flags & required_flags) == required_flags) {
+ bl->flags &= ~IOBL_PINNED;
+ ret = 0;
+ }
+err:
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(io_uring_buf_ring_unpin);
+
/* cap it at a reasonable 256, will be one page even for 4K */
#define PEEK_MAX_IMPORT 256
@@ -768,6 +821,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -ENOENT;
if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL;
+ if (bl->flags & IOBL_PINNED)
+ return -EBUSY;
scoped_guard(mutex, &ctx->mmap_lock)
xa_erase(&ctx->io_bl_xa, bl->bgid);
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 38dd5fe6716e..006e8a73a117 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -12,6 +12,11 @@ enum {
IOBL_INC = 2,
/* buffers are kernel managed */
IOBL_KERNEL_MANAGED = 4,
+ /*
+ * buffer ring is pinned and cannot be unregistered by userspace until
+ * it has been unpinned
+ */
+ IOBL_PINNED = 8,
};
struct io_buffer_list {
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH v3 4/8] io_uring/kbuf: return buffer id in buffer selection
2026-03-06 0:32 [PATCH v3 0/8] io_uring: add kernel-managed buffer rings Joanne Koong
` (2 preceding siblings ...)
2026-03-06 0:32 ` [PATCH v3 3/8] io_uring/kbuf: add buffer ring pinning/unpinning Joanne Koong
@ 2026-03-06 0:32 ` Joanne Koong
2026-03-06 0:32 ` [PATCH v3 5/8] io_uring/kbuf: add recycling for kernel managed buffer rings Joanne Koong
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joanne Koong @ 2026-03-06 0:32 UTC (permalink / raw)
To: axboe; +Cc: hch, asml.silence, bernd, csander, krisman, linux-fsdevel,
io-uring
Return the id of the selected buffer in io_buffer_select(). This is
needed for kernel-managed buffer rings to later recycle the selected
buffer.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 2 +-
include/linux/io_uring_types.h | 2 ++
io_uring/kbuf.c | 7 +++++--
3 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 7ce36e143285..505a5b13e57c 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -78,7 +78,7 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
/*
* Select a buffer from the provided buffer group for multishot uring_cmd.
- * Returns the selected buffer address and size.
+ * Returns the selected buffer address, size, and id.
*/
struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
unsigned buf_group, size_t *len,
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 36cc2e0346d9..5a56bb341337 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -100,6 +100,8 @@ struct io_br_sel {
void *kaddr;
};
ssize_t val;
+ /* id of the selected buffer */
+ unsigned buf_id;
};
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index cb2d3bbdca67..9a681241c8b3 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -206,6 +206,7 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
req->buf_index = READ_ONCE(buf->bid);
sel.buf_list = bl;
+ sel.buf_id = req->buf_index;
if (bl->flags & IOBL_KERNEL_MANAGED)
sel.kaddr = (void *)(uintptr_t)READ_ONCE(buf->addr);
else
@@ -229,10 +230,12 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
- if (bl->flags & IOBL_BUF_RING)
+ if (bl->flags & IOBL_BUF_RING) {
sel = io_ring_buffer_select(req, len, bl, issue_flags);
- else
+ } else {
sel.addr = io_provided_buffer_select(req, len, bl);
+ sel.buf_id = req->buf_index;
+ }
}
io_ring_submit_unlock(req->ctx, issue_flags);
return sel;
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH v3 5/8] io_uring/kbuf: add recycling for kernel managed buffer rings
2026-03-06 0:32 [PATCH v3 0/8] io_uring: add kernel-managed buffer rings Joanne Koong
` (3 preceding siblings ...)
2026-03-06 0:32 ` [PATCH v3 4/8] io_uring/kbuf: return buffer id in buffer selection Joanne Koong
@ 2026-03-06 0:32 ` Joanne Koong
2026-03-06 0:32 ` [PATCH v3 6/8] io_uring/kbuf: add io_uring_is_kmbuf_ring() Joanne Koong
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joanne Koong @ 2026-03-06 0:32 UTC (permalink / raw)
To: axboe; +Cc: hch, asml.silence, bernd, csander, krisman, linux-fsdevel,
io-uring
Add an interface for buffers to be recycled back into a kernel-managed
buffer ring.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 11 +++++++++
io_uring/kbuf.c | 48 ++++++++++++++++++++++++++++++++++++
2 files changed, 59 insertions(+)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 505a5b13e57c..dabe0cd3fe38 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -95,6 +95,10 @@ int io_uring_buf_ring_pin(struct io_uring_cmd *cmd, unsigned buf_group,
unsigned issue_flags, struct io_buffer_list **out_bl);
int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, unsigned buf_group,
unsigned issue_flags);
+
+int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, unsigned int buf_group,
+ u64 addr, unsigned int len, unsigned int bid,
+ unsigned int issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -150,6 +154,13 @@ static inline int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd,
{
return -EOPNOTSUPP;
}
+static inline int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd,
+ unsigned int buf_group, u64 addr,
+ unsigned int len, unsigned int bid,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
#endif
static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 9a681241c8b3..1497326694d0 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -102,6 +102,54 @@ void io_kbuf_drop_legacy(struct io_kiocb *req)
req->kbuf = NULL;
}
+int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, unsigned int buf_group,
+ u64 addr, unsigned int len, unsigned int bid,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_buf_ring *br;
+ struct io_uring_buf *buf;
+ struct io_buffer_list *bl;
+ unsigned int required_flags;
+ int ret = -EINVAL;
+
+ if (WARN_ON_ONCE(req->flags & REQ_F_BUFFERS_COMMIT))
+ return ret;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, buf_group);
+
+ if (!bl)
+ goto err;
+
+ required_flags = IOBL_BUF_RING | IOBL_KERNEL_MANAGED;
+ if (WARN_ON_ONCE((bl->flags & required_flags) != required_flags))
+ goto err;
+
+ br = bl->buf_ring;
+
+ if (WARN_ON_ONCE((__u16)(br->tail - bl->head) >= bl->nr_entries))
+ goto err;
+
+ buf = &br->bufs[(br->tail) & bl->mask];
+
+ buf->addr = addr;
+ buf->len = len;
+ buf->bid = bid;
+
+ req->flags &= ~REQ_F_BUFFER_RING;
+
+ br->tail++;
+ ret = 0;
+
+err:
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(io_uring_kmbuf_recycle);
+
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH v3 6/8] io_uring/kbuf: add io_uring_is_kmbuf_ring()
2026-03-06 0:32 [PATCH v3 0/8] io_uring: add kernel-managed buffer rings Joanne Koong
` (4 preceding siblings ...)
2026-03-06 0:32 ` [PATCH v3 5/8] io_uring/kbuf: add recycling for kernel managed buffer rings Joanne Koong
@ 2026-03-06 0:32 ` Joanne Koong
2026-03-06 0:32 ` [PATCH v3 7/8] io_uring/kbuf: export io_ring_buffer_select() Joanne Koong
2026-03-06 0:32 ` [PATCH v3 8/8] io_uring/cmd: set selected buffer index in __io_uring_cmd_done() Joanne Koong
7 siblings, 0 replies; 9+ messages in thread
From: Joanne Koong @ 2026-03-06 0:32 UTC (permalink / raw)
To: axboe; +Cc: hch, asml.silence, bernd, csander, krisman, linux-fsdevel,
io-uring
io_uring_is_kmbuf_ring() returns true if there is a kernel-managed
buffer ring at the specified buffer group.
This is a preparatory patch for upcoming fuse kernel-managed buffer
support, which needs to ensure the buffer ring registered by the server
is a kernel-managed buffer ring.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 9 +++++++++
io_uring/kbuf.c | 20 ++++++++++++++++++++
2 files changed, 29 insertions(+)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index dabe0cd3fe38..b258671099ec 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -99,6 +99,9 @@ int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, unsigned buf_group,
int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, unsigned int buf_group,
u64 addr, unsigned int len, unsigned int bid,
unsigned int issue_flags);
+
+bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd, unsigned int buf_group,
+ unsigned int issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -161,6 +164,12 @@ static inline int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd,
{
return -EOPNOTSUPP;
}
+static inline bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd,
+ unsigned int buf_group,
+ unsigned int issue_flags)
+{
+ return false;
+}
#endif
static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 1497326694d0..ef9be071ae4e 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -917,3 +917,23 @@ struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
return NULL;
return &bl->region;
}
+
+bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd, unsigned int buf_group,
+ unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct io_buffer_list *bl;
+ bool is_kmbuf_ring = false;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, buf_group);
+ if (likely(bl) && (bl->flags & IOBL_KERNEL_MANAGED)) {
+ WARN_ON_ONCE(!(bl->flags & IOBL_BUF_RING));
+ is_kmbuf_ring = true;
+ }
+
+ io_ring_submit_unlock(ctx, issue_flags);
+ return is_kmbuf_ring;
+}
+EXPORT_SYMBOL_GPL(io_uring_is_kmbuf_ring);
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH v3 7/8] io_uring/kbuf: export io_ring_buffer_select()
2026-03-06 0:32 [PATCH v3 0/8] io_uring: add kernel-managed buffer rings Joanne Koong
` (5 preceding siblings ...)
2026-03-06 0:32 ` [PATCH v3 6/8] io_uring/kbuf: add io_uring_is_kmbuf_ring() Joanne Koong
@ 2026-03-06 0:32 ` Joanne Koong
2026-03-06 0:32 ` [PATCH v3 8/8] io_uring/cmd: set selected buffer index in __io_uring_cmd_done() Joanne Koong
7 siblings, 0 replies; 9+ messages in thread
From: Joanne Koong @ 2026-03-06 0:32 UTC (permalink / raw)
To: axboe; +Cc: hch, asml.silence, bernd, csander, krisman, linux-fsdevel,
io-uring
Export io_ring_buffer_select() so that it may be used by callers who
pass in a pinned bufring without needing to grab the io_uring mutex.
This is a preparatory patch that will be needed by fuse io-uring, which
will need to select a buffer from a kernel-managed bufring while the
uring mutex may already be held by in-progress commits, and may need to
select a buffer in atomic contexts.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 14 ++++++++++++++
io_uring/kbuf.c | 7 ++++---
2 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index b258671099ec..89e1a80d9f5f 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -102,6 +102,10 @@ int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, unsigned int buf_group,
bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd, unsigned int buf_group,
unsigned int issue_flags);
+
+struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -170,6 +174,16 @@ static inline bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd,
{
return false;
}
+static inline struct io_br_sel io_ring_buffer_select(struct io_kiocb *req,
+ size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
+{
+ struct io_br_sel sel = {
+ .val = -EOPNOTSUPP,
+ };
+ return sel;
+}
#endif
static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index ef9be071ae4e..6b5f033ad8bb 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -230,9 +230,9 @@ static bool io_should_commit(struct io_kiocb *req, struct io_buffer_list *bl,
return false;
}
-static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
- struct io_buffer_list *bl,
- unsigned int issue_flags)
+struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
{
struct io_uring_buf_ring *br = bl->buf_ring;
__u16 tail, head = bl->head;
@@ -266,6 +266,7 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
}
return sel;
}
+EXPORT_SYMBOL_GPL(io_ring_buffer_select);
struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned buf_group, unsigned int issue_flags)
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH v3 8/8] io_uring/cmd: set selected buffer index in __io_uring_cmd_done()
2026-03-06 0:32 [PATCH v3 0/8] io_uring: add kernel-managed buffer rings Joanne Koong
` (6 preceding siblings ...)
2026-03-06 0:32 ` [PATCH v3 7/8] io_uring/kbuf: export io_ring_buffer_select() Joanne Koong
@ 2026-03-06 0:32 ` Joanne Koong
7 siblings, 0 replies; 9+ messages in thread
From: Joanne Koong @ 2026-03-06 0:32 UTC (permalink / raw)
To: axboe; +Cc: hch, asml.silence, bernd, csander, krisman, linux-fsdevel,
io-uring
When uring_cmd operations select a buffer, the completion queue entry
should indicate which buffer was selected.
Set IORING_CQE_F_BUFFER on the completed entry and encode the buffer
index if a buffer was selected.
This change is needed in order to relay to userspace which selected
buffer contains the data.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
io_uring/uring_cmd.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index ee7b49f47cb5..6d38df1a812d 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -151,6 +151,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
unsigned issue_flags, bool is_cqe32)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ u32 cflags = 0;
if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT))
return;
@@ -160,7 +161,10 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
if (ret < 0)
req_set_fail(req);
- io_req_set_res(req, ret, 0);
+ if (req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_BUFFER_RING))
+ cflags |= IORING_CQE_F_BUFFER |
+ (req->buf_index << IORING_CQE_BUFFER_SHIFT);
+ io_req_set_res(req, ret, cflags);
if (is_cqe32) {
if (req->ctx->flags & IORING_SETUP_CQE_MIXED)
req->cqe.flags |= IORING_CQE_F_32;
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread