* [PATCH v2 01/25] io_uring/kbuf: refactor io_buf_pbuf_register() logic into generic helpers
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
@ 2025-12-18 8:32 ` Joanne Koong
2025-12-18 8:32 ` [PATCH v2 02/25] io_uring/kbuf: rename io_unregister_pbuf_ring() to io_unregister_buf_ring() Joanne Koong
` (23 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:32 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Refactor the logic in io_register_pbuf_ring() into generic helpers:
- io_validate_buf_reg(): Validate user input and buffer registration
parameters
- io_alloc_new_buffer_list(): Allocate and initialize a new buffer
list for the given buffer group ID
- io_setup_pbuf_ring(): Set up the physical buffer ring region and
handle memory mapping for provided buffer rings
This is a preparatory change for upcoming kernel-managed buffer ring
support which will need to reuse some of these helpers.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
io_uring/kbuf.c | 123 ++++++++++++++++++++++++++++++++----------------
1 file changed, 82 insertions(+), 41 deletions(-)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 796d131107dd..100367bb510b 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -596,55 +596,71 @@ int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
return IOU_COMPLETE;
}
-int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+static int io_validate_buf_reg(struct io_uring_buf_reg *reg,
+ unsigned int permitted_flags)
{
- struct io_uring_buf_reg reg;
- struct io_buffer_list *bl;
- struct io_uring_region_desc rd;
- struct io_uring_buf_ring *br;
- unsigned long mmap_offset;
- unsigned long ring_size;
- int ret;
-
- lockdep_assert_held(&ctx->uring_lock);
-
- if (copy_from_user(&reg, arg, sizeof(reg)))
- return -EFAULT;
- if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
+ if (!mem_is_zero(reg->resv, sizeof(reg->resv)))
return -EINVAL;
- if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
+ if (reg->flags & ~permitted_flags)
return -EINVAL;
- if (!is_power_of_2(reg.ring_entries))
+ if (!is_power_of_2(reg->ring_entries))
return -EINVAL;
/* cannot disambiguate full vs empty due to head/tail size */
- if (reg.ring_entries >= 65536)
+ if (reg->ring_entries >= 65536)
return -EINVAL;
+ return 0;
+}
- bl = io_buffer_get_list(ctx, reg.bgid);
- if (bl) {
+static int io_alloc_new_buffer_list(struct io_ring_ctx *ctx,
+ struct io_uring_buf_reg *reg,
+ struct io_buffer_list **bl)
+{
+ struct io_buffer_list *list;
+
+ list = io_buffer_get_list(ctx, reg->bgid);
+ if (list) {
/* if mapped buffer ring OR classic exists, don't allow */
- if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
+ if (list->flags & IOBL_BUF_RING || !list_empty(&list->buf_list))
return -EEXIST;
- io_destroy_bl(ctx, bl);
+ io_destroy_bl(ctx, list);
}
- bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
- if (!bl)
+ list = kzalloc(sizeof(*list), GFP_KERNEL_ACCOUNT);
+ if (!list)
return -ENOMEM;
- mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
- ring_size = flex_array_size(br, bufs, reg.ring_entries);
+ list->nr_entries = reg->ring_entries;
+ list->mask = reg->ring_entries - 1;
+ list->flags = IOBL_BUF_RING;
+
+ *bl = list;
+
+ return 0;
+}
+
+static int io_setup_pbuf_ring(struct io_ring_ctx *ctx,
+ struct io_uring_buf_reg *reg,
+ struct io_buffer_list *bl)
+{
+ struct io_uring_region_desc rd;
+ unsigned long mmap_offset;
+ unsigned long ring_size;
+ int ret;
+
+ mmap_offset = (unsigned long)reg->bgid << IORING_OFF_PBUF_SHIFT;
+ ring_size = flex_array_size(bl->buf_ring, bufs, reg->ring_entries);
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(ring_size);
- if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
- rd.user_addr = reg.ring_addr;
+ if (!(reg->flags & IOU_PBUF_RING_MMAP)) {
+ rd.user_addr = reg->ring_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
+
ret = io_create_region(ctx, &bl->region, &rd, mmap_offset);
if (ret)
- goto fail;
- br = io_region_get_ptr(&bl->region);
+ return ret;
+ bl->buf_ring = io_region_get_ptr(&bl->region);
#ifdef SHM_COLOUR
/*
@@ -656,25 +672,50 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
- if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
- ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
- ret = -EINVAL;
- goto fail;
+ if (!(reg->flags & IOU_PBUF_RING_MMAP) &&
+ ((reg->ring_addr | (unsigned long)bl->buf_ring) &
+ (SHM_COLOUR - 1))) {
+ io_free_region(ctx->user, &bl->region);
+ return -EINVAL;
}
#endif
- bl->nr_entries = reg.ring_entries;
- bl->mask = reg.ring_entries - 1;
- bl->flags |= IOBL_BUF_RING;
- bl->buf_ring = br;
+ return 0;
+}
+
+int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+ unsigned int permitted_flags;
+ struct io_uring_buf_reg reg;
+ struct io_buffer_list *bl;
+ int ret;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+
+ permitted_flags = IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC;
+ ret = io_validate_buf_reg(&reg, permitted_flags);
+ if (ret)
+ return ret;
+
+ ret = io_alloc_new_buffer_list(ctx, &reg, &bl);
+ if (ret)
+ return ret;
+
+ ret = io_setup_pbuf_ring(ctx, &reg, bl);
+ if (ret) {
+ kfree(bl);
+ return ret;
+ }
+
if (reg.flags & IOU_PBUF_RING_INC)
bl->flags |= IOBL_INC;
+
io_buffer_add_list(ctx, bl, reg.bgid);
+
return 0;
-fail:
- io_free_region(ctx->user, &bl->region);
- kfree(bl);
- return ret;
}
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread

* [PATCH v2 02/25] io_uring/kbuf: rename io_unregister_pbuf_ring() to io_unregister_buf_ring()
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
2025-12-18 8:32 ` [PATCH v2 01/25] io_uring/kbuf: refactor io_buf_pbuf_register() logic into generic helpers Joanne Koong
@ 2025-12-18 8:32 ` Joanne Koong
2025-12-18 8:32 ` [PATCH v2 03/25] io_uring/kbuf: add support for kernel-managed buffer rings Joanne Koong
` (22 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:32 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Use the more generic name io_unregister_buf_ring() as this function will
be used for unregistering both provided buffer rings and kernel-managed
buffer rings.
This is a preparatory change for upcoming kernel-managed buffer ring
support.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
io_uring/kbuf.c | 2 +-
io_uring/kbuf.h | 2 +-
io_uring/register.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 100367bb510b..cbe477db7b86 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -718,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
-int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+int io_unregister_buf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index bf15e26520d3..40b44f4fdb15 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -74,7 +74,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
-int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+int io_unregister_buf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
diff --git a/io_uring/register.c b/io_uring/register.c
index 62d39b3ff317..4c6879698844 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -750,7 +750,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
- ret = io_unregister_pbuf_ring(ctx, arg);
+ ret = io_unregister_buf_ring(ctx, arg);
break;
case IORING_REGISTER_SYNC_CANCEL:
ret = -EINVAL;
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread

* [PATCH v2 03/25] io_uring/kbuf: add support for kernel-managed buffer rings
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
2025-12-18 8:32 ` [PATCH v2 01/25] io_uring/kbuf: refactor io_buf_pbuf_register() logic into generic helpers Joanne Koong
2025-12-18 8:32 ` [PATCH v2 02/25] io_uring/kbuf: rename io_unregister_pbuf_ring() to io_unregister_buf_ring() Joanne Koong
@ 2025-12-18 8:32 ` Joanne Koong
2025-12-21 12:24 ` kernel test robot
2025-12-18 8:32 ` [PATCH v2 04/25] io_uring/kbuf: add mmap " Joanne Koong
` (21 subsequent siblings)
24 siblings, 1 reply; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:32 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add support for kernel-managed buffer rings (kmbuf rings), which allow
the kernel to allocate and manage the backing buffers for a buffer
ring, rather than requiring the application to provide and manage them.
This introduces two new registration opcodes:
- IORING_REGISTER_KMBUF_RING: Register a kernel-managed buffer ring
- IORING_UNREGISTER_KMBUF_RING: Unregister a kernel-managed buffer ring
The existing io_uring_buf_reg structure is extended with a union to
support both application-provided buffer rings (pbuf) and kernel-managed
buffer rings (kmbuf):
- For pbuf rings: ring_addr specifies the user-provided ring address
- For kmbuf rings: buf_size specifies the size of each buffer. buf_size
must be non-zero and page-aligned.
The implementation follows the same pattern as pbuf ring registration,
reusing the validation and buffer list allocation helpers introduced in
earlier refactoring. The IOBL_KERNEL_MANAGED flag marks buffer lists as
kernel-managed for appropriate handling in the I/O path.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/uapi/linux/io_uring.h | 15 ++++-
io_uring/kbuf.c | 76 +++++++++++++++++++++++
io_uring/kbuf.h | 7 ++-
io_uring/memmap.c | 112 ++++++++++++++++++++++++++++++++++
io_uring/memmap.h | 4 ++
io_uring/register.c | 7 +++
6 files changed, 217 insertions(+), 4 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b5b23c0d5283..589755a4e2b4 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -700,6 +700,10 @@ enum io_uring_register_op {
/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
IORING_REGISTER_ZCRX_CTRL = 36,
+ /* register/unregister kernel-managed ring buffer group */
+ IORING_REGISTER_KMBUF_RING = 37,
+ IORING_UNREGISTER_KMBUF_RING = 38,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -869,9 +873,16 @@ enum io_uring_register_pbuf_ring_flags {
IOU_PBUF_RING_INC = 2,
};
-/* argument for IORING_(UN)REGISTER_PBUF_RING */
+/* argument for IORING_(UN)REGISTER_PBUF_RING and
+ * IORING_(UN)REGISTER_KMBUF_RING
+ */
struct io_uring_buf_reg {
- __u64 ring_addr;
+ union {
+ /* used for pbuf rings */
+ __u64 ring_addr;
+ /* used for kmbuf rings */
+ __u32 buf_size;
+ };
__u32 ring_entries;
__u16 bgid;
__u16 flags;
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index cbe477db7b86..9dff21783f68 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -778,3 +778,79 @@ struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
return NULL;
return &bl->region;
}
+
+static int io_setup_kmbuf_ring(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl,
+ struct io_uring_buf_reg *reg)
+{
+ struct io_uring_buf_ring *ring;
+ unsigned long ring_size;
+ void *buf_region;
+ unsigned int i;
+ int ret;
+
+ /* allocate pages for the ring structure */
+ ring_size = flex_array_size(ring, bufs, bl->nr_entries);
+ ring = kzalloc(ring_size, GFP_KERNEL_ACCOUNT);
+ if (!ring)
+ return -ENOMEM;
+
+ ret = io_create_region_multi_buf(ctx, &bl->region, bl->nr_entries,
+ reg->buf_size);
+ if (ret) {
+ kfree(ring);
+ return ret;
+ }
+
+ /* initialize ring buf entries to point to the buffers */
+ buf_region = bl->region.ptr;
+ for (i = 0; i < bl->nr_entries; i++) {
+ struct io_uring_buf *buf = &ring->bufs[i];
+
+ buf->addr = (u64)buf_region;
+ buf->len = reg->buf_size;
+ buf->bid = i;
+
+ buf_region += reg->buf_size;
+ }
+ ring->tail = bl->nr_entries;
+
+ bl->buf_ring = ring;
+
+ return 0;
+}
+
+int io_register_kmbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_buf_reg reg;
+ struct io_buffer_list *bl;
+ int ret;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+
+ ret = io_validate_buf_reg(&reg, 0);
+ if (ret)
+ return ret;
+
+ if (!reg.buf_size || !PAGE_ALIGNED(reg.buf_size))
+ return -EINVAL;
+
+ ret = io_alloc_new_buffer_list(ctx, &reg, &bl);
+ if (ret)
+ return ret;
+
+ ret = io_setup_kmbuf_ring(ctx, bl, &reg);
+ if (ret) {
+ kfree(bl);
+ return ret;
+ }
+
+ bl->flags |= IOBL_KERNEL_MANAGED;
+
+ io_buffer_add_list(ctx, bl, reg.bgid);
+
+ return 0;
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 40b44f4fdb15..62c80a1ebf03 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -7,9 +7,11 @@
enum {
/* ring mapped provided buffers */
- IOBL_BUF_RING = 1,
+ IOBL_BUF_RING = 1,
/* buffers are consumed incrementally rather than always fully */
- IOBL_INC = 2,
+ IOBL_INC = 2,
+ /* buffers are kernel managed */
+ IOBL_KERNEL_MANAGED = 4,
};
struct io_buffer_list {
@@ -74,6 +76,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+int io_register_kmbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_buf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 18e574776ef6..4573eed3b072 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -15,6 +15,28 @@
#include "rsrc.h"
#include "zcrx.h"
+static void release_multi_buf_pages(struct page **pages, unsigned long nr_pages)
+{
+ struct page *page;
+ unsigned int nr, i = 0;
+
+ while (nr_pages) {
+ page = pages[i];
+
+ if (!page || WARN_ON_ONCE(page != compound_head(page)))
+ return;
+
+ nr = compound_nr(page);
+ put_page(page);
+
+ if (WARN_ON_ONCE(nr > nr_pages))
+ return;
+
+ i += nr;
+ nr_pages -= nr;
+ }
+}
+
static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
size_t size, gfp_t gfp)
{
@@ -86,6 +108,8 @@ enum {
IO_REGION_F_USER_PROVIDED = 2,
/* only the first page in the array is ref'ed */
IO_REGION_F_SINGLE_REF = 4,
+ /* pages in the array belong to multiple discrete allocations */
+ IO_REGION_F_MULTI_BUF = 8,
};
void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
@@ -98,6 +122,8 @@ void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
if (mr->flags & IO_REGION_F_USER_PROVIDED)
unpin_user_pages(mr->pages, nr_refs);
+ else if (mr->flags & IO_REGION_F_MULTI_BUF)
+ release_multi_buf_pages(mr->pages, nr_refs);
else
release_pages(mr->pages, nr_refs);
@@ -149,6 +175,54 @@ static int io_region_pin_pages(struct io_mapped_region *mr,
return 0;
}
+static int io_region_allocate_pages_multi_buf(struct io_mapped_region *mr,
+ unsigned int nr_bufs,
+ unsigned int buf_size)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+ struct page **pages, **cur_pages;
+ unsigned int nr_allocated;
+ unsigned int buf_pages;
+ unsigned int i;
+
+ if (!PAGE_ALIGNED(buf_size))
+ return -EINVAL;
+
+ buf_pages = buf_size >> PAGE_SHIFT;
+
+ pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
+ if (!pages)
+ return -ENOMEM;
+
+ cur_pages = pages;
+
+ for (i = 0; i < nr_bufs; i++) {
+ if (io_mem_alloc_compound(cur_pages, buf_pages, buf_size,
+ gfp)) {
+ cur_pages += buf_pages;
+ continue;
+ }
+
+ nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE,
+ buf_pages, cur_pages);
+ if (nr_allocated != buf_pages) {
+ unsigned int total =
+ (cur_pages - pages) + nr_allocated;
+
+ release_multi_buf_pages(pages, total);
+ kvfree(pages);
+ return -ENOMEM;
+ }
+
+ cur_pages += buf_pages;
+ }
+
+ mr->flags |= IO_REGION_F_MULTI_BUF;
+ mr->pages = pages;
+
+ return 0;
+}
+
static int io_region_allocate_pages(struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset)
@@ -181,6 +255,44 @@ static int io_region_allocate_pages(struct io_mapped_region *mr,
return 0;
}
+int io_create_region_multi_buf(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr,
+ unsigned int nr_bufs, unsigned int buf_size)
+{
+ unsigned long nr_pages;
+ int ret;
+
+ if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
+ return -EFAULT;
+
+ if (WARN_ON_ONCE(!nr_bufs || !buf_size))
+ return -EINVAL;
+
+ nr_pages = ((size_t)buf_size * nr_bufs) >> PAGE_SHIFT;
+ if (nr_pages > UINT_MAX)
+ return -E2BIG;
+
+ if (ctx->user) {
+ ret = __io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ return ret;
+ }
+ mr->nr_pages = nr_pages;
+
+ ret = io_region_allocate_pages_multi_buf(mr, nr_bufs, buf_size);
+ if (ret)
+ goto out_free;
+
+ ret = io_region_init_ptr(mr);
+ if (ret)
+ goto out_free;
+
+ return 0;
+out_free:
+ io_free_region(ctx->user, mr);
+ return ret;
+}
+
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset)
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index a39d9e518905..b09fc34d5eb9 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -21,6 +21,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset);
+int io_create_region_multi_buf(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr,
+ unsigned int nr_bufs, unsigned int buf_size);
+
static inline void *io_region_get_ptr(struct io_mapped_region *mr)
{
return mr->ptr;
diff --git a/io_uring/register.c b/io_uring/register.c
index 4c6879698844..4aabf6e44083 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -746,7 +746,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_pbuf_ring(ctx, arg);
break;
+ case IORING_REGISTER_KMBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_kmbuf_ring(ctx, arg);
+ break;
case IORING_UNREGISTER_PBUF_RING:
+ case IORING_UNREGISTER_KMBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread

* Re: [PATCH v2 03/25] io_uring/kbuf: add support for kernel-managed buffer rings
2025-12-18 8:32 ` [PATCH v2 03/25] io_uring/kbuf: add support for kernel-managed buffer rings Joanne Koong
@ 2025-12-21 12:24 ` kernel test robot
0 siblings, 0 replies; 33+ messages in thread
From: kernel test robot @ 2025-12-21 12:24 UTC (permalink / raw)
To: Joanne Koong, miklos, axboe
Cc: oe-kbuild-all, bschubert, asml.silence, io-uring, csander,
xiaobing.li, linux-fsdevel
Hi Joanne,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe/for-next]
[also build test WARNING on linus/master v6.19-rc1 next-20251219]
[cannot apply to mszeredi-fuse/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Joanne-Koong/io_uring-kbuf-refactor-io_buf_pbuf_register-logic-into-generic-helpers/20251218-165107
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20251218083319.3485503-4-joannelkoong%40gmail.com
patch subject: [PATCH v2 03/25] io_uring/kbuf: add support for kernel-managed buffer rings
config: nios2-allnoconfig (https://download.01.org/0day-ci/archive/20251221/202512212016.Nbc4ikuj-lkp@intel.com/config)
compiler: nios2-linux-gcc (GCC) 11.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251221/202512212016.Nbc4ikuj-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512212016.Nbc4ikuj-lkp@intel.com/
All warnings (new ones prefixed by >>):
io_uring/kbuf.c: In function 'io_setup_kmbuf_ring':
>> io_uring/kbuf.c:810:29: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
810 | buf->addr = (u64)buf_region;
| ^
vim +810 io_uring/kbuf.c
781
782 static int io_setup_kmbuf_ring(struct io_ring_ctx *ctx,
783 struct io_buffer_list *bl,
784 struct io_uring_buf_reg *reg)
785 {
786 struct io_uring_buf_ring *ring;
787 unsigned long ring_size;
788 void *buf_region;
789 unsigned int i;
790 int ret;
791
792 /* allocate pages for the ring structure */
793 ring_size = flex_array_size(ring, bufs, bl->nr_entries);
794 ring = kzalloc(ring_size, GFP_KERNEL_ACCOUNT);
795 if (!ring)
796 return -ENOMEM;
797
798 ret = io_create_region_multi_buf(ctx, &bl->region, bl->nr_entries,
799 reg->buf_size);
800 if (ret) {
801 kfree(ring);
802 return ret;
803 }
804
805 /* initialize ring buf entries to point to the buffers */
806 buf_region = bl->region.ptr;
807 for (i = 0; i < bl->nr_entries; i++) {
808 struct io_uring_buf *buf = &ring->bufs[i];
809
> 810 buf->addr = (u64)buf_region;
811 buf->len = reg->buf_size;
812 buf->bid = i;
813
814 buf_region += reg->buf_size;
815 }
816 ring->tail = bl->nr_entries;
817
818 bl->buf_ring = ring;
819
820 return 0;
821 }
822
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH v2 04/25] io_uring/kbuf: add mmap support for kernel-managed buffer rings
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (2 preceding siblings ...)
2025-12-18 8:32 ` [PATCH v2 03/25] io_uring/kbuf: add support for kernel-managed buffer rings Joanne Koong
@ 2025-12-18 8:32 ` Joanne Koong
2025-12-18 8:32 ` [PATCH v2 05/25] io_uring/kbuf: support kernel-managed buffer rings in buffer selection Joanne Koong
` (20 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:32 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add support for mmapping kernel-managed buffer rings (kmbuf) to
userspace, allowing applications to access the kernel-allocated buffers.
Similar to application-provided buffer rings (pbuf), kmbuf rings use the
buffer group ID encoded in the mmap offset to identify which buffer ring
to map. The implementation follows the same pattern as pbuf rings.
New mmap offset constants are introduced:
- IORING_OFF_KMBUF_RING (0x88000000): Base offset for kmbuf mappings
- IORING_OFF_KMBUF_SHIFT (16): Shift value to encode buffer group ID
The mmap offset is calculated during registration, encoding the bgid
shifted by IORING_OFF_KMBUF_SHIFT. The io_buf_get_region() helper
retrieves the appropriate region.
This allows userspace to mmap the kernel-allocated buffer region and
access the buffers directly.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/uapi/linux/io_uring.h | 2 ++
io_uring/kbuf.c | 11 +++++++++--
io_uring/kbuf.h | 5 +++--
io_uring/memmap.c | 5 ++++-
4 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 589755a4e2b4..96e936503ef6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -533,6 +533,8 @@ struct io_uring_cqe {
#define IORING_OFF_SQES 0x10000000ULL
#define IORING_OFF_PBUF_RING 0x80000000ULL
#define IORING_OFF_PBUF_SHIFT 16
+#define IORING_OFF_KMBUF_RING 0x88000000ULL
+#define IORING_OFF_KMBUF_SHIFT 16
#define IORING_OFF_MMAP_MASK 0xf8000000ULL
/*
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 9dff21783f68..65102aaadd15 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -766,16 +766,23 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
-struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
- unsigned int bgid)
+struct io_mapped_region *io_buf_get_region(struct io_ring_ctx *ctx,
+ unsigned int bgid,
+ bool kernel_managed)
{
struct io_buffer_list *bl;
+ bool is_kernel_managed;
lockdep_assert_held(&ctx->mmap_lock);
bl = xa_load(&ctx->io_bl_xa, bgid);
if (!bl || !(bl->flags & IOBL_BUF_RING))
return NULL;
+
+ is_kernel_managed = !!(bl->flags & IOBL_KERNEL_MANAGED);
+ if (is_kernel_managed != kernel_managed)
+ return NULL;
+
return &bl->region;
}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 62c80a1ebf03..11d165888b8e 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -88,8 +88,9 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl,
bool io_kbuf_commit(struct io_kiocb *req,
struct io_buffer_list *bl, int len, int nr);
-struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
- unsigned int bgid);
+struct io_mapped_region *io_buf_get_region(struct io_ring_ctx *ctx,
+ unsigned int bgid,
+ bool kernel_managed);
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req,
struct io_buffer_list *bl)
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 4573eed3b072..5b4065a8f183 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -357,7 +357,10 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
return &ctx->sq_region;
case IORING_OFF_PBUF_RING:
id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- return io_pbuf_get_region(ctx, id);
+ return io_buf_get_region(ctx, id, false);
+ case IORING_OFF_KMBUF_RING:
+ id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_KMBUF_SHIFT;
+ return io_buf_get_region(ctx, id, true);
case IORING_MAP_OFF_PARAM_REGION:
return &ctx->param_region;
case IORING_MAP_OFF_ZCRX_REGION:
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 05/25] io_uring/kbuf: support kernel-managed buffer rings in buffer selection
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (3 preceding siblings ...)
2025-12-18 8:32 ` [PATCH v2 04/25] io_uring/kbuf: add mmap " Joanne Koong
@ 2025-12-18 8:32 ` Joanne Koong
2025-12-21 13:49 ` kernel test robot
2025-12-18 8:33 ` [PATCH v2 06/25] io_uring/kbuf: add buffer ring pinning/unpinning Joanne Koong
` (19 subsequent siblings)
24 siblings, 1 reply; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:32 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Allow kernel-managed buffers to be selected. This requires modifying the
io_br_sel struct to separate the fields for address and val, since a
kernel address cannot be distinguished from a negative val when error
checking.
Auto-commit any selected kernel-managed buffer.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring_types.h | 8 ++++----
io_uring/kbuf.c | 15 ++++++++++++---
2 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1adb0d20a0a..36fac08db636 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -93,13 +93,13 @@ struct io_mapped_region {
*/
struct io_br_sel {
struct io_buffer_list *buf_list;
- /*
- * Some selection parts return the user address, others return an error.
- */
union {
+ /* for classic/ring provided buffers */
void __user *addr;
- ssize_t val;
+ /* for kernel-managed buffers */
+ void *kaddr;
};
+ ssize_t val;
};
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 65102aaadd15..c98cecb56b8c 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -155,7 +155,8 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
return 1;
}
-static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags)
+static bool io_should_commit(struct io_kiocb *req, struct io_buffer_list *bl,
+ unsigned int issue_flags)
{
/*
* If we came in unlocked, we have no choice but to consume the
@@ -170,7 +171,11 @@ static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags)
if (issue_flags & IO_URING_F_UNLOCKED)
return true;
- /* uring_cmd commits kbuf upfront, no need to auto-commit */
+ /* kernel-managed buffers are auto-committed */
+ if (bl->flags & IOBL_KERNEL_MANAGED)
+ return true;
+
+ /* multishot uring_cmd commits kbuf upfront, no need to auto-commit */
if (!io_file_can_poll(req) && req->opcode != IORING_OP_URING_CMD)
return true;
return false;
@@ -201,8 +206,12 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->buf_index = READ_ONCE(buf->bid);
sel.buf_list = bl;
sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
+ if (bl->flags & IOBL_KERNEL_MANAGED)
+ sel.kaddr = (void *)buf->addr;
+ else
+ sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
- if (io_should_commit(req, issue_flags)) {
+ if (io_should_commit(req, bl, issue_flags)) {
io_kbuf_commit(req, sel.buf_list, *len, 1);
sel.buf_list = NULL;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread

* Re: [PATCH v2 05/25] io_uring/kbuf: support kernel-managed buffer rings in buffer selection
2025-12-18 8:32 ` [PATCH v2 05/25] io_uring/kbuf: support kernel-managed buffer rings in buffer selection Joanne Koong
@ 2025-12-21 13:49 ` kernel test robot
0 siblings, 0 replies; 33+ messages in thread
From: kernel test robot @ 2025-12-21 13:49 UTC (permalink / raw)
To: Joanne Koong, miklos, axboe
Cc: oe-kbuild-all, bschubert, asml.silence, io-uring, csander,
xiaobing.li, linux-fsdevel
Hi Joanne,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe/for-next]
[also build test WARNING on linus/master v6.19-rc1 next-20251219]
[cannot apply to mszeredi-fuse/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Joanne-Koong/io_uring-kbuf-refactor-io_buf_pbuf_register-logic-into-generic-helpers/20251218-165107
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20251218083319.3485503-6-joannelkoong%40gmail.com
patch subject: [PATCH v2 05/25] io_uring/kbuf: support kernel-managed buffer rings in buffer selection
config: nios2-allnoconfig (https://download.01.org/0day-ci/archive/20251221/202512212111.RWRN4N7A-lkp@intel.com/config)
compiler: nios2-linux-gcc (GCC) 11.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251221/202512212111.RWRN4N7A-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512212111.RWRN4N7A-lkp@intel.com/
All warnings (new ones prefixed by >>):
io_uring/kbuf.c: In function 'io_ring_buffer_select':
>> io_uring/kbuf.c:210:29: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
210 | sel.kaddr = (void *)buf->addr;
| ^
io_uring/kbuf.c: In function 'io_setup_kmbuf_ring':
io_uring/kbuf.c:826:29: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
826 | buf->addr = (u64)buf_region;
| ^
vim +210 io_uring/kbuf.c
183
184 static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
185 struct io_buffer_list *bl,
186 unsigned int issue_flags)
187 {
188 struct io_uring_buf_ring *br = bl->buf_ring;
189 __u16 tail, head = bl->head;
190 struct io_br_sel sel = { };
191 struct io_uring_buf *buf;
192 u32 buf_len;
193
194 tail = smp_load_acquire(&br->tail);
195 if (unlikely(tail == head))
196 return sel;
197
198 if (head + 1 == tail)
199 req->flags |= REQ_F_BL_EMPTY;
200
201 buf = io_ring_head_to_buf(br, head, bl->mask);
202 buf_len = READ_ONCE(buf->len);
203 if (*len == 0 || *len > buf_len)
204 *len = buf_len;
205 req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
206 req->buf_index = READ_ONCE(buf->bid);
207 sel.buf_list = bl;
208 sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
209 if (bl->flags & IOBL_KERNEL_MANAGED)
> 210 sel.kaddr = (void *)buf->addr;
211 else
212 sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
213
214 if (io_should_commit(req, bl, issue_flags)) {
215 io_kbuf_commit(req, sel.buf_list, *len, 1);
216 sel.buf_list = NULL;
217 }
218 return sel;
219 }
220
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH v2 06/25] io_uring/kbuf: add buffer ring pinning/unpinning
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (4 preceding siblings ...)
2025-12-18 8:32 ` [PATCH v2 05/25] io_uring/kbuf: support kernel-managed buffer rings in buffer selection Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 14:21 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 07/25] io_uring/kbuf: add recycling for kernel managed buffer rings Joanne Koong
` (18 subsequent siblings)
24 siblings, 1 reply; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add kernel APIs to pin and unpin buffer rings, preventing userspace from
unregistering a buffer ring while it is pinned by the kernel.
This provides a mechanism for kernel subsystems to safely access buffer
ring contents while ensuring the buffer ring remains valid. A pinned
buffer ring cannot be unregistered until explicitly unpinned. On the
userspace side, trying to unregister a pinned buffer will return -EBUSY.
This is a preparatory change for upcoming fuse usage of kernel-managed
buffer rings. It is necessary for fuse to pin the buffer ring because
fuse may need to select a buffer in atomic contexts, which it can only
do by using the underlying buffer list pointer.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 17 +++++++++++++
io_uring/kbuf.c | 48 ++++++++++++++++++++++++++++++++++++
io_uring/kbuf.h | 10 ++++++++
io_uring/uring_cmd.c | 18 ++++++++++++++
4 files changed, 93 insertions(+)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 375fd048c4cb..424f071f42e5 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -84,6 +84,10 @@ struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
struct io_br_sel *sel, unsigned int issue_flags);
+int io_uring_cmd_buf_ring_pin(struct io_uring_cmd *ioucmd, unsigned buf_group,
+ unsigned issue_flags, struct io_buffer_list **bl);
+int io_uring_cmd_buf_ring_unpin(struct io_uring_cmd *ioucmd, unsigned buf_group,
+ unsigned issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -126,6 +130,19 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
{
return true;
}
+static inline int io_uring_cmd_buf_ring_pin(struct io_uring_cmd *ioucmd,
+ unsigned buf_group,
+ unsigned issue_flags,
+ struct io_buffer_list **bl)
+{
+ return -EOPNOTSUPP;
+}
+static inline int io_uring_cmd_buf_ring_unpin(struct io_uring_cmd *ioucmd,
+ unsigned buf_group,
+ unsigned issue_flags)
+{
+ return -EOPNOTSUPP;
+}
#endif
static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index c98cecb56b8c..49dc75f24432 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -238,6 +238,52 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
return sel;
}
+int io_kbuf_ring_pin(struct io_kiocb *req, unsigned buf_group,
+ unsigned issue_flags, struct io_buffer_list **bl)
+{
+ struct io_buffer_list *buffer_list;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret = -EINVAL;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ buffer_list = io_buffer_get_list(ctx, buf_group);
+ if (likely(buffer_list) && likely(buffer_list->flags & IOBL_BUF_RING)) {
+ if (unlikely(buffer_list->flags & IOBL_PINNED)) {
+ ret = -EALREADY;
+ } else {
+ buffer_list->flags |= IOBL_PINNED;
+ ret = 0;
+ *bl = buffer_list;
+ }
+ }
+
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(io_kbuf_ring_pin);
+
+int io_kbuf_ring_unpin(struct io_kiocb *req, unsigned buf_group,
+ unsigned issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ int ret = -EINVAL;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, buf_group);
+ if (likely(bl) && likely(bl->flags & IOBL_BUF_RING) &&
+ likely(bl->flags & IOBL_PINNED)) {
+ bl->flags &= ~IOBL_PINNED;
+ ret = 0;
+ }
+
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(io_kbuf_ring_unpin);
+
/* cap it at a reasonable 256, will be one page even for 4K */
#define PEEK_MAX_IMPORT 256
@@ -744,6 +790,8 @@ int io_unregister_buf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -ENOENT;
if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL;
+ if (bl->flags & IOBL_PINNED)
+ return -EBUSY;
scoped_guard(mutex, &ctx->mmap_lock)
xa_erase(&ctx->io_bl_xa, bl->bgid);
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 11d165888b8e..c4368f35cf11 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -12,6 +12,11 @@ enum {
IOBL_INC = 2,
/* buffers are kernel managed */
IOBL_KERNEL_MANAGED = 4,
+ /*
+ * buffer ring is pinned and cannot be unregistered by userspace until
+ * it has been unpinned
+ */
+ IOBL_PINNED = 8,
};
struct io_buffer_list {
@@ -136,4 +141,9 @@ static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
return 0;
return __io_put_kbufs(req, bl, len, nbufs);
}
+
+int io_kbuf_ring_pin(struct io_kiocb *req, unsigned buf_group,
+ unsigned issue_flags, struct io_buffer_list **bl);
+int io_kbuf_ring_unpin(struct io_kiocb *req, unsigned buf_group,
+ unsigned issue_flags);
#endif
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 197474911f04..8ac79ead4158 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -398,3 +398,21 @@ bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
return true;
}
EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe);
+
+int io_uring_cmd_buf_ring_pin(struct io_uring_cmd *ioucmd, unsigned buf_group,
+ unsigned issue_flags, struct io_buffer_list **bl)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+
+ return io_kbuf_ring_pin(req, buf_group, issue_flags, bl);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_buf_ring_pin);
+
+int io_uring_cmd_buf_ring_unpin(struct io_uring_cmd *ioucmd, unsigned buf_group,
+ unsigned issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+
+ return io_kbuf_ring_unpin(req, buf_group, issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_buf_ring_unpin);
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* Re: [PATCH v2 06/25] io_uring/kbuf: add buffer ring pinning/unpinning
2025-12-18 8:33 ` [PATCH v2 06/25] io_uring/kbuf: add buffer ring pinning/unpinning Joanne Koong
@ 2025-12-18 14:21 ` Joanne Koong
0 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 14:21 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
On Thu, Dec 18, 2025 at 4:34 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> Add kernel APIs to pin and unpin buffer rings, preventing userspace from
> unregistering a buffer ring while it is pinned by the kernel.
>
> This provides a mechanism for kernel subsystems to safely access buffer
> ring contents while ensuring the buffer ring remains valid. A pinned
> buffer ring cannot be unregistered until explicitly unpinned. On the
> userspace side, trying to unregister a pinned buffer will return -EBUSY.
>
> This is a preparatory change for upcoming fuse usage of kernel-managed
> buffer rings. It is necessary for fuse to pin the buffer ring because
> fuse may need to select a buffer in atomic contexts, which it can only
> do by using the underlying buffer list pointer.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
> include/linux/io_uring/cmd.h | 17 +++++++++++++
> io_uring/kbuf.c | 48 ++++++++++++++++++++++++++++++++++++
> io_uring/kbuf.h | 10 ++++++++
> io_uring/uring_cmd.c | 18 ++++++++++++++
> 4 files changed, 93 insertions(+)
>
> diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
> index c98cecb56b8c..49dc75f24432 100644
> --- a/io_uring/kbuf.c
> +++ b/io_uring/kbuf.c
> @@ -238,6 +238,52 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
> return sel;
> }
>
> +int io_kbuf_ring_pin(struct io_kiocb *req, unsigned buf_group,
> + unsigned issue_flags, struct io_buffer_list **bl)
> +{
> + struct io_buffer_list *buffer_list;
> + struct io_ring_ctx *ctx = req->ctx;
> + int ret = -EINVAL;
> +
> + io_ring_submit_lock(ctx, issue_flags);
> +
> + buffer_list = io_buffer_get_list(ctx, buf_group);
> + if (likely(buffer_list) && likely(buffer_list->flags & IOBL_BUF_RING)) {
> + if (unlikely(buffer_list->flags & IOBL_PINNED)) {
> + ret = -EALREADY;
> + } else {
> + buffer_list->flags |= IOBL_PINNED;
> + ret = 0;
> + *bl = buffer_list;
> + }
> + }
> +
> + io_ring_submit_unlock(ctx, issue_flags);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(io_kbuf_ring_pin);
This EXPORT_SYMBOL_GPL (and the one below) are remnants from v1 and
are no longer necessary. I'll remove these for the next version.
> +
> +int io_kbuf_ring_unpin(struct io_kiocb *req, unsigned buf_group,
> + unsigned issue_flags)
> +{
> + struct io_ring_ctx *ctx = req->ctx;
> + struct io_buffer_list *bl;
> + int ret = -EINVAL;
> +
> + io_ring_submit_lock(ctx, issue_flags);
> +
> + bl = io_buffer_get_list(ctx, buf_group);
> + if (likely(bl) && likely(bl->flags & IOBL_BUF_RING) &&
> + likely(bl->flags & IOBL_PINNED)) {
> + bl->flags &= ~IOBL_PINNED;
> + ret = 0;
> + }
> +
> + io_ring_submit_unlock(ctx, issue_flags);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(io_kbuf_ring_unpin);
> +
^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH v2 07/25] io_uring/kbuf: add recycling for kernel managed buffer rings
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (5 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 06/25] io_uring/kbuf: add buffer ring pinning/unpinning Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 08/25] io_uring: add io_uring_cmd_fixed_index_get() and io_uring_cmd_fixed_index_put() Joanne Koong
` (17 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add an interface for buffers to be recycled back into a kernel-managed
buffer ring.
This is a preparatory patch for fuse over io-uring.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 13 +++++++++++
io_uring/kbuf.c | 42 ++++++++++++++++++++++++++++++++++++
io_uring/kbuf.h | 3 +++
io_uring/uring_cmd.c | 11 ++++++++++
4 files changed, 69 insertions(+)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 424f071f42e5..7169a2a9a744 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -88,6 +88,11 @@ int io_uring_cmd_buf_ring_pin(struct io_uring_cmd *ioucmd, unsigned buf_group,
unsigned issue_flags, struct io_buffer_list **bl);
int io_uring_cmd_buf_ring_unpin(struct io_uring_cmd *ioucmd, unsigned buf_group,
unsigned issue_flags);
+
+int io_uring_cmd_kmbuffer_recycle(struct io_uring_cmd *cmd,
+ unsigned int buf_group, u64 addr,
+ unsigned int len, unsigned int bid,
+ unsigned int issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -143,6 +148,14 @@ static inline int io_uring_cmd_buf_ring_unpin(struct io_uring_cmd *ioucmd,
{
return -EOPNOTSUPP;
}
+static inline int io_uring_cmd_kmbuffer_recycle(struct io_uring_cmd *cmd,
+ unsigned int buf_group,
+ u64 addr, unsigned int len,
+ unsigned int bid,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
#endif
static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 49dc75f24432..f494d896c17e 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -101,6 +101,48 @@ void io_kbuf_drop_legacy(struct io_kiocb *req)
req->kbuf = NULL;
}
+int io_kmbuf_recycle(struct io_kiocb *req, unsigned int bgid, u64 addr,
+ unsigned int len, unsigned int bid,
+ unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_buf_ring *br;
+ struct io_uring_buf *buf;
+ struct io_buffer_list *bl;
+ int ret = -EINVAL;
+
+ if (WARN_ON_ONCE(req->flags & REQ_F_BUFFERS_COMMIT))
+ return ret;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, bgid);
+
+ if (WARN_ON_ONCE(!(bl->flags & IOBL_BUF_RING)) ||
+ WARN_ON_ONCE(!(bl->flags & IOBL_KERNEL_MANAGED)))
+ goto done;
+
+ br = bl->buf_ring;
+
+ if (WARN_ON_ONCE((br->tail - bl->head) >= bl->nr_entries))
+ goto done;
+
+ buf = &br->bufs[(br->tail) & bl->mask];
+
+ buf->addr = addr;
+ buf->len = len;
+ buf->bid = bid;
+
+ req->flags &= ~REQ_F_BUFFER_RING;
+
+ br->tail++;
+ ret = 0;
+
+done:
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index c4368f35cf11..4d8b7491628e 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -146,4 +146,7 @@ int io_kbuf_ring_pin(struct io_kiocb *req, unsigned buf_group,
unsigned issue_flags, struct io_buffer_list **bl);
int io_kbuf_ring_unpin(struct io_kiocb *req, unsigned buf_group,
unsigned issue_flags);
+int io_kmbuf_recycle(struct io_kiocb *req, unsigned int bgid, u64 addr,
+ unsigned int len, unsigned int bid,
+ unsigned int issue_flags);
#endif
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 8ac79ead4158..b6b675010bfd 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -416,3 +416,14 @@ int io_uring_cmd_buf_ring_unpin(struct io_uring_cmd *ioucmd, unsigned buf_group,
return io_kbuf_ring_unpin(req, buf_group, issue_flags);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_buf_ring_unpin);
+
+int io_uring_cmd_kmbuffer_recycle(struct io_uring_cmd *ioucmd,
+ unsigned int buf_group, u64 addr,
+ unsigned int len, unsigned int bid,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+
+ return io_kmbuf_recycle(req, buf_group, addr, len, bid, issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_kmbuffer_recycle);
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 08/25] io_uring: add io_uring_cmd_fixed_index_get() and io_uring_cmd_fixed_index_put()
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (6 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 07/25] io_uring/kbuf: add recycling for kernel managed buffer rings Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 09/25] io_uring/kbuf: add io_uring_cmd_is_kmbuf_ring() Joanne Koong
` (16 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add two new helpers, io_uring_cmd_fixed_index_get() and
io_uring_cmd_fixed_index_put(). io_uring_cmd_fixed_index_get()
constructs an iter for a fixed buffer at a given index and acquires a
refcount on the underlying node. io_uring_cmd_fixed_index_put()
decrements this refcount. The caller is responsible for ensuring
io_uring_cmd_fixed_index_put() is properly called for releasing the
refcount after it is done using the iter it obtained through
io_uring_cmd_fixed_index_get().
This is a preparatory patch needed for fuse-over-io-uring support, as
the metadata for fuse requests will be stored at the last index, which
will be different from the buf index set on the sqe.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 20 +++++++++++
io_uring/rsrc.c | 65 ++++++++++++++++++++++++++++++++++++
io_uring/rsrc.h | 5 +++
io_uring/uring_cmd.c | 21 ++++++++++++
4 files changed, 111 insertions(+)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 7169a2a9a744..2988592e045c 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -44,6 +44,12 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
size_t uvec_segs,
int ddir, struct iov_iter *iter,
unsigned issue_flags);
+int io_uring_cmd_fixed_index_get(struct io_uring_cmd *ioucmd, u16 buf_index,
+ unsigned int off, size_t len, int ddir,
+ struct iov_iter *iter,
+ unsigned int issue_flags);
+int io_uring_cmd_fixed_index_put(struct io_uring_cmd *ioucmd, u16 buf_index,
+ unsigned int issue_flags);
/*
* Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd
@@ -109,6 +115,20 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
{
return -EOPNOTSUPP;
}
+static inline int io_uring_cmd_fixed_index_get(struct io_uring_cmd *ioucmd,
+ u16 buf_index, unsigned int off,
+ size_t len, int ddir,
+ struct iov_iter *iter,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
+static inline int io_uring_cmd_fixed_index_put(struct io_uring_cmd *ioucmd,
+ u16 buf_index,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret,
u64 ret2, unsigned issue_flags, bool is_cqe32)
{
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a63474b331bf..a141aaeb099d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1151,6 +1151,71 @@ int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}
+int io_reg_buf_index_get(struct io_kiocb *req, struct iov_iter *iter,
+ u16 buf_index, unsigned int off, size_t len,
+ int ddir, unsigned issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_rsrc_node *node;
+ struct io_mapped_ubuf *imu;
+ u64 addr;
+ int err;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ node = io_rsrc_node_lookup(&ctx->buf_table, buf_index);
+ if (!node) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return -EINVAL;
+ }
+
+ node->refs++;
+
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ imu = node->buf;
+ if (!imu) {
+ err = -EFAULT;
+ goto error;
+ }
+
+ if (check_add_overflow(imu->ubuf, off, &addr)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ err = io_import_fixed(ddir, iter, imu, addr, len);
+ if (err)
+ goto error;
+
+ return 0;
+
+error:
+ io_reg_buf_index_put(req, buf_index, issue_flags);
+ return err;
+}
+
+int io_reg_buf_index_put(struct io_kiocb *req, u16 buf_index,
+ unsigned issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_rsrc_node *node;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ node = io_rsrc_node_lookup(&ctx->buf_table, buf_index);
+ if (WARN_ON_ONCE(!node)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return -EFAULT;
+ }
+
+ io_put_rsrc_node(ctx, node);
+
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ return 0;
+}
+
/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index d603f6a47f5e..16f4bab9582b 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -64,6 +64,11 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
u64 buf_addr, size_t len, int ddir,
unsigned issue_flags);
+int io_reg_buf_index_get(struct io_kiocb *req, struct iov_iter *iter,
+ u16 buf_index, unsigned int off, size_t len,
+ int ddir, unsigned issue_flags);
+int io_reg_buf_index_put(struct io_kiocb *req, u16 buf_index,
+ unsigned issue_flags);
int io_import_reg_vec(int ddir, struct iov_iter *iter,
struct io_kiocb *req, struct iou_vec *vec,
unsigned nr_iovs, unsigned issue_flags);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index b6b675010bfd..ee95d1102505 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -314,6 +314,27 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed_vec);
+int io_uring_cmd_fixed_index_get(struct io_uring_cmd *ioucmd, u16 buf_index,
+ unsigned int off, size_t len, int ddir,
+ struct iov_iter *iter,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+
+ return io_reg_buf_index_get(req, iter, buf_index, off, len, ddir,
+ issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_fixed_index_get);
+
+int io_uring_cmd_fixed_index_put(struct io_uring_cmd *ioucmd, u16 buf_index,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+
+ return io_reg_buf_index_put(req, buf_index, issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_fixed_index_put);
+
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 09/25] io_uring/kbuf: add io_uring_cmd_is_kmbuf_ring()
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (7 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 08/25] io_uring: add io_uring_cmd_fixed_index_get() and io_uring_cmd_fixed_index_put() Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 10/25] io_uring/kbuf: export io_ring_buffer_select() Joanne Koong
` (15 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
io_uring_cmd_is_kmbuf_ring() returns true if there is a kernel-managed
buffer ring at the specified buffer group.
This is a preparatory patch for upcoming fuse kernel-managed buffer
support, which needs to ensure the buffer ring registered by the server
is a kernel-managed buffer ring.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 9 +++++++++
io_uring/kbuf.c | 19 +++++++++++++++++++
io_uring/kbuf.h | 2 ++
io_uring/uring_cmd.c | 9 +++++++++
4 files changed, 39 insertions(+)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 2988592e045c..a94f1dbc89c7 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -99,6 +99,9 @@ int io_uring_cmd_kmbuffer_recycle(struct io_uring_cmd *cmd,
unsigned int buf_group, u64 addr,
unsigned int len, unsigned int bid,
unsigned int issue_flags);
+
+int io_uring_cmd_is_kmbuf_ring(struct io_uring_cmd *ioucmd,
+ unsigned int buf_group, unsigned int issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -176,6 +179,12 @@ static inline int io_uring_cmd_kmbuffer_recycle(struct io_uring_cmd *cmd,
{
return -EOPNOTSUPP;
}
+static inline int io_uring_cmd_is_kmbuf_ring(struct io_uring_cmd *ioucmd,
+ unsigned int buf_group,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
#endif
static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index f494d896c17e..b16f6a6aa872 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -960,3 +960,22 @@ int io_register_kmbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
+
+bool io_is_kmbuf_ring(struct io_kiocb *req, unsigned int buf_group,
+ unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ bool is_kmbuf_ring = false;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, buf_group);
+ if (likely(bl) && (bl->flags & IOBL_KERNEL_MANAGED)) {
+ WARN_ON_ONCE(!(bl->flags & IOBL_BUF_RING));
+ is_kmbuf_ring = true;
+ }
+
+ io_ring_submit_unlock(ctx, issue_flags);
+ return is_kmbuf_ring;
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 4d8b7491628e..68c4c78fbb44 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -149,4 +149,6 @@ int io_kbuf_ring_unpin(struct io_kiocb *req, unsigned buf_group,
int io_kmbuf_recycle(struct io_kiocb *req, unsigned int bgid, u64 addr,
unsigned int len, unsigned int bid,
unsigned int issue_flags);
+bool io_is_kmbuf_ring(struct io_kiocb *req, unsigned int buf_group,
+ unsigned int issue_flags);
#endif
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index ee95d1102505..4534710252da 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -448,3 +448,12 @@ int io_uring_cmd_kmbuffer_recycle(struct io_uring_cmd *ioucmd,
return io_kmbuf_recycle(req, buf_group, addr, len, bid, issue_flags);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_kmbuffer_recycle);
+
+int io_uring_cmd_is_kmbuf_ring(struct io_uring_cmd *ioucmd,
+ unsigned int buf_group, unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+
+ return io_is_kmbuf_ring(req, buf_group, issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_is_kmbuf_ring);
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 10/25] io_uring/kbuf: export io_ring_buffer_select()
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (8 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 09/25] io_uring/kbuf: add io_uring_cmd_is_kmbuf_ring() Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 11/25] io_uring/kbuf: return buffer id in buffer selection Joanne Koong
` (14 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Export io_ring_buffer_select() so that it may be used by callers who
pass in a pinned bufring without needing to grab the io_uring mutex.
This is a preparatory patch that will be needed by fuse io-uring, which
will need to select a buffer from a kernel-managed bufring while the
uring mutex may already be held by in-progress commits, and may need to
select a buffer in atomic contexts.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/buf.h | 25 +++++++++++++++++++++++++
io_uring/kbuf.c | 8 +++++---
2 files changed, 30 insertions(+), 3 deletions(-)
create mode 100644 include/linux/io_uring/buf.h
diff --git a/include/linux/io_uring/buf.h b/include/linux/io_uring/buf.h
new file mode 100644
index 000000000000..3f7426ced3eb
--- /dev/null
+++ b/include/linux/io_uring/buf.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IO_URING_BUF_H
+#define _LINUX_IO_URING_BUF_H
+
+#include <linux/io_uring_types.h>
+
+#if defined(CONFIG_IO_URING)
+struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags);
+#else
+static inline struct io_br_sel io_ring_buffer_select(struct io_kiocb *req,
+ size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
+{
+ struct io_br_sel sel = {
+ .val = -EOPNOTSUPP,
+ };
+
+ return sel;
+}
+#endif /* CONFIG_IO_URING */
+
+#endif /* _LINUX_IO_URING_BUF_H */
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index b16f6a6aa872..3b1f6296f581 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -9,6 +9,7 @@
#include <linux/poll.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
+#include <linux/io_uring/buf.h>
#include <uapi/linux/io_uring.h>
@@ -223,9 +224,9 @@ static bool io_should_commit(struct io_kiocb *req, struct io_buffer_list *bl,
return false;
}
-static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
- struct io_buffer_list *bl,
- unsigned int issue_flags)
+struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
{
struct io_uring_buf_ring *br = bl->buf_ring;
__u16 tail, head = bl->head;
@@ -259,6 +260,7 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
}
return sel;
}
+EXPORT_SYMBOL_GPL(io_ring_buffer_select);
struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned buf_group, unsigned int issue_flags)
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 11/25] io_uring/kbuf: return buffer id in buffer selection
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (9 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 10/25] io_uring/kbuf: export io_ring_buffer_select() Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 12/25] io_uring/cmd: set selected buffer index in __io_uring_cmd_done() Joanne Koong
` (13 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Return the id of the selected buffer in io_buffer_select(). This is
needed for kernel-managed buffer rings to later recycle the selected
buffer.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 2 +-
include/linux/io_uring_types.h | 2 ++
io_uring/kbuf.c | 8 +++++---
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index a94f1dbc89c7..61c4ca863ef6 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -77,7 +77,7 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
/*
* Select a buffer from the provided buffer group for multishot uring_cmd.
- * Returns the selected buffer address and size.
+ * Returns the selected buffer address, size, and id.
*/
struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
unsigned buf_group, size_t *len,
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 36fac08db636..52fce7eba400 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -100,6 +100,8 @@ struct io_br_sel {
void *kaddr;
};
ssize_t val;
+ /* id of the selected buffer */
+ unsigned buf_id;
};
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 3b1f6296f581..d9beebb3aed2 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -248,7 +248,7 @@ struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
req->buf_index = READ_ONCE(buf->bid);
sel.buf_list = bl;
- sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
+ sel.buf_id = req->buf_index;
if (bl->flags & IOBL_KERNEL_MANAGED)
sel.kaddr = (void *)buf->addr;
else
@@ -273,10 +273,12 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
- if (bl->flags & IOBL_BUF_RING)
+ if (bl->flags & IOBL_BUF_RING) {
sel = io_ring_buffer_select(req, len, bl, issue_flags);
- else
+ } else {
sel.addr = io_provided_buffer_select(req, len, bl);
+ sel.buf_id = req->buf_index;
+ }
}
io_ring_submit_unlock(req->ctx, issue_flags);
return sel;
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 12/25] io_uring/cmd: set selected buffer index in __io_uring_cmd_done()
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (10 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 11/25] io_uring/kbuf: return buffer id in buffer selection Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 13/25] fuse: refactor io-uring logic for getting next fuse request Joanne Koong
` (12 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
When uring_cmd operations select a buffer, the completion queue entry
should indicate which buffer was selected.
Set IORING_CQE_F_BUFFER on the completed entry and encode the buffer
index if a buffer was selected.
This will be needed for fuse, which needs to relay to userspace which
selected buffer contains the data.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
io_uring/uring_cmd.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 4534710252da..c78a06845cbc 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -142,6 +142,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
unsigned issue_flags, bool is_cqe32)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ u32 cflags = 0;
if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT))
return;
@@ -151,7 +152,10 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
if (ret < 0)
req_set_fail(req);
- io_req_set_res(req, ret, 0);
+ if (req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_BUFFER_RING))
+ cflags |= IORING_CQE_F_BUFFER |
+ (req->buf_index << IORING_CQE_BUFFER_SHIFT);
+ io_req_set_res(req, ret, cflags);
if (is_cqe32) {
if (req->ctx->flags & IORING_SETUP_CQE_MIXED)
req->cqe.flags |= IORING_CQE_F_32;
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 13/25] fuse: refactor io-uring logic for getting next fuse request
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (11 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 12/25] io_uring/cmd: set selected buffer index in __io_uring_cmd_done() Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 14/25] fuse: refactor io-uring header copying to ring Joanne Koong
` (11 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Simplify the logic for getting the next fuse request.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Bernd Schubert <bschubert@ddn.com>
---
fs/fuse/dev_uring.c | 78 ++++++++++++++++-----------------------------
1 file changed, 28 insertions(+), 50 deletions(-)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 5ceb217ced1b..1efee4391af5 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -714,34 +714,6 @@ static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
return err;
}
-/*
- * Write data to the ring buffer and send the request to userspace,
- * userspace will read it
- * This is comparable with classical read(/dev/fuse)
- */
-static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
- struct fuse_req *req,
- unsigned int issue_flags)
-{
- struct fuse_ring_queue *queue = ent->queue;
- int err;
- struct io_uring_cmd *cmd;
-
- err = fuse_uring_prepare_send(ent, req);
- if (err)
- return err;
-
- spin_lock(&queue->lock);
- cmd = ent->cmd;
- ent->cmd = NULL;
- ent->state = FRRS_USERSPACE;
- list_move_tail(&ent->list, &queue->ent_in_userspace);
- spin_unlock(&queue->lock);
-
- io_uring_cmd_done(cmd, 0, issue_flags);
- return 0;
-}
-
/*
* Make a ring entry available for fuse_req assignment
*/
@@ -838,11 +810,13 @@ static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
}
/*
- * Get the next fuse req and send it
+ * Get the next fuse req.
+ *
+ * Returns true if the next fuse request has been assigned to the ent.
+ * Else, there is no next fuse request and this returns false.
*/
-static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent,
- struct fuse_ring_queue *queue,
- unsigned int issue_flags)
+static bool fuse_uring_get_next_fuse_req(struct fuse_ring_ent *ent,
+ struct fuse_ring_queue *queue)
{
int err;
struct fuse_req *req;
@@ -854,10 +828,12 @@ static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent,
spin_unlock(&queue->lock);
if (req) {
- err = fuse_uring_send_next_to_ring(ent, req, issue_flags);
+ err = fuse_uring_prepare_send(ent, req);
if (err)
goto retry;
}
+
+ return req != NULL;
}
static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent)
@@ -875,6 +851,20 @@ static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent)
return 0;
}
+static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
+ ssize_t ret, unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ spin_lock(&queue->lock);
+ ent->state = FRRS_USERSPACE;
+ list_move_tail(&ent->list, &queue->ent_in_userspace);
+ ent->cmd = NULL;
+ spin_unlock(&queue->lock);
+
+ io_uring_cmd_done(cmd, ret, issue_flags);
+}
+
/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */
static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
struct fuse_conn *fc)
@@ -946,7 +936,8 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
* and fetching is done in one step vs legacy fuse, which has separated
* read (fetch request) and write (commit result).
*/
- fuse_uring_next_fuse_req(ent, queue, issue_flags);
+ if (fuse_uring_get_next_fuse_req(ent, queue))
+ fuse_uring_send(ent, cmd, 0, issue_flags);
return 0;
}
@@ -1194,20 +1185,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
return -EIOCBQUEUED;
}
-static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
- ssize_t ret, unsigned int issue_flags)
-{
- struct fuse_ring_queue *queue = ent->queue;
-
- spin_lock(&queue->lock);
- ent->state = FRRS_USERSPACE;
- list_move_tail(&ent->list, &queue->ent_in_userspace);
- ent->cmd = NULL;
- spin_unlock(&queue->lock);
-
- io_uring_cmd_done(cmd, ret, issue_flags);
-}
-
/*
* This prepares and sends the ring request in fuse-uring task context.
* User buffers are not mapped yet - the application does not have permission
@@ -1224,8 +1201,9 @@ static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw)
if (!tw.cancel) {
err = fuse_uring_prepare_send(ent, ent->fuse_req);
if (err) {
- fuse_uring_next_fuse_req(ent, queue, issue_flags);
- return;
+ if (!fuse_uring_get_next_fuse_req(ent, queue))
+ return;
+ err = 0;
}
} else {
err = -ECANCELED;
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 14/25] fuse: refactor io-uring header copying to ring
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (12 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 13/25] fuse: refactor io-uring logic for getting next fuse request Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 15/25] fuse: refactor io-uring header copying from ring Joanne Koong
` (10 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Move the logic for copying headers to the ring into a new
copy_header_to_ring() function. This consolidates error handling.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/dev_uring.c | 39 +++++++++++++++++++++------------------
1 file changed, 21 insertions(+), 18 deletions(-)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 1efee4391af5..7962a9876031 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -575,6 +575,18 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
return err;
}
+static __always_inline int copy_header_to_ring(void __user *ring,
+ const void *header,
+ size_t header_size)
+{
+ if (copy_to_user(ring, header, header_size)) {
+ pr_info_ratelimited("Copying header to ring failed.\n");
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
struct fuse_req *req,
struct fuse_ring_ent *ent)
@@ -637,13 +649,11 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
* Some op code have that as zero size.
*/
if (args->in_args[0].size > 0) {
- err = copy_to_user(&ent->headers->op_in, in_args->value,
- in_args->size);
- if (err) {
- pr_info_ratelimited(
- "Copying the header failed.\n");
- return -EFAULT;
- }
+ err = copy_header_to_ring(&ent->headers->op_in,
+ in_args->value,
+ in_args->size);
+ if (err)
+ return err;
}
in_args++;
num_args--;
@@ -659,9 +669,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
}
ent_in_out.payload_sz = cs.ring.copied_sz;
- err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out,
- sizeof(ent_in_out));
- return err ? -EFAULT : 0;
+ return copy_header_to_ring(&ent->headers->ring_ent_in_out, &ent_in_out,
+ sizeof(ent_in_out));
}
static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
@@ -690,14 +699,8 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
}
/* copy fuse_in_header */
- err = copy_to_user(&ent->headers->in_out, &req->in.h,
- sizeof(req->in.h));
- if (err) {
- err = -EFAULT;
- return err;
- }
-
- return 0;
+ return copy_header_to_ring(&ent->headers->in_out, &req->in.h,
+ sizeof(req->in.h));
}
static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 15/25] fuse: refactor io-uring header copying from ring
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (13 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 14/25] fuse: refactor io-uring header copying to ring Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 16/25] fuse: use enum types for header copying Joanne Koong
` (9 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Move the logic for copying headers from the ring into a new
copy_header_from_ring() function. This consolidates error handling.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/dev_uring.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 7962a9876031..e8ee51bfa5fc 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -587,6 +587,18 @@ static __always_inline int copy_header_to_ring(void __user *ring,
return 0;
}
+static __always_inline int copy_header_from_ring(void *header,
+ const void __user *ring,
+ size_t header_size)
+{
+ if (copy_from_user(header, ring, header_size)) {
+ pr_info_ratelimited("Copying header from ring failed.\n");
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
struct fuse_req *req,
struct fuse_ring_ent *ent)
@@ -597,10 +609,10 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
int err;
struct fuse_uring_ent_in_out ring_in_out;
- err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out,
- sizeof(ring_in_out));
+ err = copy_header_from_ring(&ring_in_out, &ent->headers->ring_ent_in_out,
+ sizeof(ring_in_out));
if (err)
- return -EFAULT;
+ return err;
err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz,
&iter);
@@ -794,10 +806,10 @@ static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
struct fuse_conn *fc = ring->fc;
ssize_t err = 0;
- err = copy_from_user(&req->out.h, &ent->headers->in_out,
- sizeof(req->out.h));
+ err = copy_header_from_ring(&req->out.h, &ent->headers->in_out,
+ sizeof(req->out.h));
if (err) {
- req->out.h.error = -EFAULT;
+ req->out.h.error = err;
goto out;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 16/25] fuse: use enum types for header copying
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (14 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 15/25] fuse: refactor io-uring header copying from ring Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 17/25] fuse: refactor setting up copy state for payload copying Joanne Koong
` (8 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Use enum types to identify which part of the header needs to be copied.
This improves the interface and will simplify handling of both kernel-space
and user-space header addresses when kernel-managed buffer rings are added.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Bernd Schubert <bschubert@ddn.com>
---
fs/fuse/dev_uring.c | 57 +++++++++++++++++++++++++++++++++++++--------
1 file changed, 47 insertions(+), 10 deletions(-)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index e8ee51bfa5fc..d16f6b3489c1 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -31,6 +31,15 @@ struct fuse_uring_pdu {
static const struct fuse_iqueue_ops fuse_io_uring_ops;
+enum fuse_uring_header_type {
+ /* struct fuse_in_header / struct fuse_out_header */
+ FUSE_URING_HEADER_IN_OUT,
+ /* per op code header */
+ FUSE_URING_HEADER_OP,
+ /* struct fuse_uring_ent_in_out header */
+ FUSE_URING_HEADER_RING_ENT,
+};
+
static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
struct fuse_ring_ent *ring_ent)
{
@@ -575,10 +584,32 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
return err;
}
-static __always_inline int copy_header_to_ring(void __user *ring,
+static void __user *get_user_ring_header(struct fuse_ring_ent *ent,
+ enum fuse_uring_header_type type)
+{
+ switch (type) {
+ case FUSE_URING_HEADER_IN_OUT:
+ return &ent->headers->in_out;
+ case FUSE_URING_HEADER_OP:
+ return &ent->headers->op_in;
+ case FUSE_URING_HEADER_RING_ENT:
+ return &ent->headers->ring_ent_in_out;
+ }
+
+ WARN_ON_ONCE(1);
+ return NULL;
+}
+
+static __always_inline int copy_header_to_ring(struct fuse_ring_ent *ent,
+ enum fuse_uring_header_type type,
const void *header,
size_t header_size)
{
+ void __user *ring = get_user_ring_header(ent, type);
+
+ if (!ring)
+ return -EINVAL;
+
if (copy_to_user(ring, header, header_size)) {
pr_info_ratelimited("Copying header to ring failed.\n");
return -EFAULT;
@@ -587,10 +618,16 @@ static __always_inline int copy_header_to_ring(void __user *ring,
return 0;
}
-static __always_inline int copy_header_from_ring(void *header,
- const void __user *ring,
+static __always_inline int copy_header_from_ring(struct fuse_ring_ent *ent,
+ enum fuse_uring_header_type type,
+ void *header,
size_t header_size)
{
+ const void __user *ring = get_user_ring_header(ent, type);
+
+ if (!ring)
+ return -EINVAL;
+
if (copy_from_user(header, ring, header_size)) {
pr_info_ratelimited("Copying header from ring failed.\n");
return -EFAULT;
@@ -609,8 +646,8 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
int err;
struct fuse_uring_ent_in_out ring_in_out;
- err = copy_header_from_ring(&ring_in_out, &ent->headers->ring_ent_in_out,
- sizeof(ring_in_out));
+ err = copy_header_from_ring(ent, FUSE_URING_HEADER_RING_ENT,
+ &ring_in_out, sizeof(ring_in_out));
if (err)
return err;
@@ -661,7 +698,7 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
* Some op code have that as zero size.
*/
if (args->in_args[0].size > 0) {
- err = copy_header_to_ring(&ent->headers->op_in,
+ err = copy_header_to_ring(ent, FUSE_URING_HEADER_OP,
in_args->value,
in_args->size);
if (err)
@@ -681,8 +718,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
}
ent_in_out.payload_sz = cs.ring.copied_sz;
- return copy_header_to_ring(&ent->headers->ring_ent_in_out, &ent_in_out,
- sizeof(ent_in_out));
+ return copy_header_to_ring(ent, FUSE_URING_HEADER_RING_ENT,
+ &ent_in_out, sizeof(ent_in_out));
}
static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
@@ -711,7 +748,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
}
/* copy fuse_in_header */
- return copy_header_to_ring(&ent->headers->in_out, &req->in.h,
+ return copy_header_to_ring(ent, FUSE_URING_HEADER_IN_OUT, &req->in.h,
sizeof(req->in.h));
}
@@ -806,7 +843,7 @@ static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
struct fuse_conn *fc = ring->fc;
ssize_t err = 0;
- err = copy_header_from_ring(&req->out.h, &ent->headers->in_out,
+ err = copy_header_from_ring(ent, FUSE_URING_HEADER_IN_OUT, &req->out.h,
sizeof(req->out.h));
if (err) {
req->out.h.error = err;
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 17/25] fuse: refactor setting up copy state for payload copying
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (15 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 16/25] fuse: use enum types for header copying Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 18/25] fuse: support buffer copying for kernel addresses Joanne Koong
` (7 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add a new helper function setup_fuse_copy_state() to contain the logic
for setting up the copy state for payload copying.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Bernd Schubert <bschubert@ddn.com>
---
fs/fuse/dev_uring.c | 38 ++++++++++++++++++++++++--------------
1 file changed, 24 insertions(+), 14 deletions(-)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index d16f6b3489c1..b57871f92d08 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -636,6 +636,27 @@ static __always_inline int copy_header_from_ring(struct fuse_ring_ent *ent,
return 0;
}
+static int setup_fuse_copy_state(struct fuse_copy_state *cs,
+ struct fuse_ring *ring, struct fuse_req *req,
+ struct fuse_ring_ent *ent, int dir,
+ struct iov_iter *iter)
+{
+ int err;
+
+ err = import_ubuf(dir, ent->payload, ring->max_payload_sz, iter);
+ if (err) {
+ pr_info_ratelimited("fuse: Import of user buffer failed\n");
+ return err;
+ }
+
+ fuse_copy_init(cs, dir == ITER_DEST, iter);
+
+ cs->is_uring = true;
+ cs->req = req;
+
+ return 0;
+}
+
static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
struct fuse_req *req,
struct fuse_ring_ent *ent)
@@ -651,15 +672,10 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
if (err)
return err;
- err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz,
- &iter);
+ err = setup_fuse_copy_state(&cs, ring, req, ent, ITER_SOURCE, &iter);
if (err)
return err;
- fuse_copy_init(&cs, false, &iter);
- cs.is_uring = true;
- cs.req = req;
-
err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
fuse_copy_finish(&cs);
return err;
@@ -682,15 +698,9 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
.commit_id = req->in.h.unique,
};
- err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter);
- if (err) {
- pr_info_ratelimited("fuse: Import of user buffer failed\n");
+ err = setup_fuse_copy_state(&cs, ring, req, ent, ITER_DEST, &iter);
+ if (err)
return err;
- }
-
- fuse_copy_init(&cs, true, &iter);
- cs.is_uring = true;
- cs.req = req;
if (num_args > 0) {
/*
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 18/25] fuse: support buffer copying for kernel addresses
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (16 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 17/25] fuse: refactor setting up copy state for payload copying Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring Joanne Koong
` (6 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
This is a preparatory patch needed to support kernel-managed ring
buffers in fuse-over-io-uring. For kernel-managed ring buffers, we get
the vmapped address of the buffer which we can directly use.
Currently, buffer copying in fuse only supports extracting underlying
pages from an iov iter and kmapping them. This commit allows buffer
copying to work directly on a kaddr.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/dev.c | 23 +++++++++++++++++------
fs/fuse/fuse_dev_i.h | 7 ++++++-
2 files changed, 23 insertions(+), 7 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6d59cbc877c6..ceb5d6a553c0 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -848,6 +848,9 @@ void fuse_copy_init(struct fuse_copy_state *cs, bool write,
/* Unmap and put previous page of userspace buffer */
void fuse_copy_finish(struct fuse_copy_state *cs)
{
+ if (cs->is_kaddr)
+ return;
+
if (cs->currbuf) {
struct pipe_buffer *buf = cs->currbuf;
@@ -873,6 +876,9 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
struct page *page;
int err;
+ if (cs->is_kaddr)
+ return 0;
+
err = unlock_request(cs->req);
if (err)
return err;
@@ -931,15 +937,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
{
unsigned ncpy = min(*size, cs->len);
if (val) {
- void *pgaddr = kmap_local_page(cs->pg);
- void *buf = pgaddr + cs->offset;
+ void *pgaddr, *buf;
+ if (!cs->is_kaddr) {
+ pgaddr = kmap_local_page(cs->pg);
+ buf = pgaddr + cs->offset;
+ } else {
+ buf = cs->kaddr + cs->offset;
+ }
if (cs->write)
memcpy(buf, *val, ncpy);
else
memcpy(*val, buf, ncpy);
-
- kunmap_local(pgaddr);
+ if (!cs->is_kaddr)
+ kunmap_local(pgaddr);
*val += ncpy;
}
*size -= ncpy;
@@ -1127,7 +1138,7 @@ static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop,
}
while (count) {
- if (cs->write && cs->pipebufs && folio) {
+ if (cs->write && cs->pipebufs && folio && !cs->is_kaddr) {
/*
* Can't control lifetime of pipe buffers, so always
* copy user pages.
@@ -1139,7 +1150,7 @@ static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop,
} else {
return fuse_ref_folio(cs, folio, offset, count);
}
- } else if (!cs->len) {
+ } else if (!cs->len && !cs->is_kaddr) {
if (cs->move_folios && folio &&
offset == 0 && count == size) {
err = fuse_try_move_folio(cs, foliop);
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 134bf44aff0d..aa1d25421054 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -28,12 +28,17 @@ struct fuse_copy_state {
struct pipe_buffer *currbuf;
struct pipe_inode_info *pipe;
unsigned long nr_segs;
- struct page *pg;
+ union {
+ struct page *pg;
+ void *kaddr;
+ };
unsigned int len;
unsigned int offset;
bool write:1;
bool move_folios:1;
bool is_uring:1;
+ /* if set, use kaddr; otherwise use pg */
+ bool is_kaddr:1;
struct {
unsigned int copied_sz; /* copied size into the user buffer */
} ring;
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (17 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 18/25] fuse: support buffer copying for kernel addresses Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-20 22:45 ` kernel test robot
` (2 more replies)
2025-12-18 8:33 ` [PATCH v2 20/25] io_uring/rsrc: rename io_buffer_register_bvec()/io_buffer_unregister_bvec() Joanne Koong
` (5 subsequent siblings)
24 siblings, 3 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add io-uring kernel-managed buffer ring capability for fuse daemons
communicating through the io-uring interface.
This has two benefits:
a) eliminates the overhead of pinning/unpinning user pages and
translating virtual addresses for every server-kernel interaction
b) reduces the amount of memory needed for the buffers per queue and
allows buffers to be reused across entries. Incremental buffer
consumption, when added, will allow a buffer to be used across multiple
requests.
Buffer ring usage is set on a per-queue basis. In order to use this, the
daemon needs to have preregistered a kernel-managed buffer ring and a
fixed buffer at index 0 that will hold all the headers, and set the
"use_bufring" field during registration. The kernel-managed buffer ring
will be pinned for the lifetime of the connection.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/dev_uring.c | 422 ++++++++++++++++++++++++++++++++------
fs/fuse/dev_uring_i.h | 30 ++-
include/uapi/linux/fuse.h | 12 +-
3 files changed, 395 insertions(+), 69 deletions(-)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index b57871f92d08..d028cdd57f45 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -10,6 +10,8 @@
#include "fuse_trace.h"
#include <linux/fs.h>
+#include <linux/io_uring.h>
+#include <linux/io_uring/buf.h>
#include <linux/io_uring/cmd.h>
static bool __read_mostly enable_uring;
@@ -19,6 +21,8 @@ MODULE_PARM_DESC(enable_uring,
#define FUSE_URING_IOV_SEGS 2 /* header and payload */
+#define FUSE_URING_RINGBUF_GROUP 0
+#define FUSE_URING_FIXED_HEADERS_OFFSET 0
bool fuse_uring_enabled(void)
{
@@ -276,20 +280,46 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
return res;
}
-static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
- int qid)
+static int fuse_uring_buf_ring_setup(struct io_uring_cmd *cmd,
+ struct fuse_ring_queue *queue,
+ unsigned int issue_flags)
+{
+ int err;
+
+ err = io_uring_cmd_buf_ring_pin(cmd, FUSE_URING_RINGBUF_GROUP,
+ issue_flags, &queue->bufring);
+ if (err)
+ return err;
+
+ if (!io_uring_cmd_is_kmbuf_ring(cmd, FUSE_URING_RINGBUF_GROUP,
+ issue_flags)) {
+ io_uring_cmd_buf_ring_unpin(cmd,
+ FUSE_URING_RINGBUF_GROUP,
+ issue_flags);
+ return -EINVAL;
+ }
+
+ queue->use_bufring = true;
+
+ return 0;
+}
+
+static struct fuse_ring_queue *
+fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
+ int qid, bool use_bufring, unsigned int issue_flags)
{
struct fuse_conn *fc = ring->fc;
struct fuse_ring_queue *queue;
struct list_head *pq;
+ int err;
queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
if (!queue)
- return NULL;
+ return ERR_PTR(-ENOMEM);
pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
if (!pq) {
kfree(queue);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
queue->qid = qid;
@@ -307,6 +337,15 @@ static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
queue->fpq.processing = pq;
fuse_pqueue_init(&queue->fpq);
+ if (use_bufring) {
+ err = fuse_uring_buf_ring_setup(cmd, queue, issue_flags);
+ if (err) {
+ kfree(pq);
+ kfree(queue);
+ return ERR_PTR(err);
+ }
+ }
+
spin_lock(&fc->lock);
if (ring->queues[qid]) {
spin_unlock(&fc->lock);
@@ -584,6 +623,35 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
return err;
}
+static int get_kernel_ring_header(struct fuse_ring_ent *ent,
+ enum fuse_uring_header_type type,
+ struct iov_iter *headers_iter)
+{
+ size_t offset;
+
+ switch (type) {
+ case FUSE_URING_HEADER_IN_OUT:
+ /* No offset - start of header */
+ offset = 0;
+ break;
+ case FUSE_URING_HEADER_OP:
+ offset = offsetof(struct fuse_uring_req_header, op_in);
+ break;
+ case FUSE_URING_HEADER_RING_ENT:
+ offset = offsetof(struct fuse_uring_req_header, ring_ent_in_out);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid header type: %d\n", type);
+ return -EINVAL;
+ }
+
+ *headers_iter = ent->headers_iter;
+ if (offset)
+ iov_iter_advance(headers_iter, offset);
+
+ return 0;
+}
+
static void __user *get_user_ring_header(struct fuse_ring_ent *ent,
enum fuse_uring_header_type type)
{
@@ -605,17 +673,38 @@ static __always_inline int copy_header_to_ring(struct fuse_ring_ent *ent,
const void *header,
size_t header_size)
{
- void __user *ring = get_user_ring_header(ent, type);
+ bool use_bufring = ent->queue->use_bufring;
+ int err = 0;
- if (!ring)
- return -EINVAL;
+ if (use_bufring) {
+ struct iov_iter iter;
+
+ err = get_kernel_ring_header(ent, type, &iter);
+ if (err)
+ goto done;
+
+ if (copy_to_iter(header, header_size, &iter) != header_size)
+ err = -EFAULT;
+ } else {
+ void __user *ring = get_user_ring_header(ent, type);
+
+ if (!ring) {
+ err = -EINVAL;
+ goto done;
+ }
- if (copy_to_user(ring, header, header_size)) {
- pr_info_ratelimited("Copying header to ring failed.\n");
- return -EFAULT;
+ if (copy_to_user(ring, header, header_size))
+ err = -EFAULT;
}
- return 0;
+done:
+ if (err)
+ pr_info_ratelimited("Copying header to ring failed: "
+ "header_type=%u, header_size=%lu, "
+ "use_bufring=%d\n", type, header_size,
+ use_bufring);
+
+ return err;
}
static __always_inline int copy_header_from_ring(struct fuse_ring_ent *ent,
@@ -623,17 +712,38 @@ static __always_inline int copy_header_from_ring(struct fuse_ring_ent *ent,
void *header,
size_t header_size)
{
- const void __user *ring = get_user_ring_header(ent, type);
+ bool use_bufring = ent->queue->use_bufring;
+ int err = 0;
- if (!ring)
- return -EINVAL;
+ if (use_bufring) {
+ struct iov_iter iter;
+
+ err = get_kernel_ring_header(ent, type, &iter);
+ if (err)
+ goto done;
+
+ if (copy_from_iter(header, header_size, &iter) != header_size)
+ err = -EFAULT;
+ } else {
+ const void __user *ring = get_user_ring_header(ent, type);
+
+ if (!ring) {
+ err = -EINVAL;
+ goto done;
+ }
- if (copy_from_user(header, ring, header_size)) {
- pr_info_ratelimited("Copying header from ring failed.\n");
- return -EFAULT;
+ if (copy_from_user(header, ring, header_size))
+ err = -EFAULT;
}
- return 0;
+done:
+ if (err)
+ pr_info_ratelimited("Copying header from ring failed: "
+ "header_type=%u, header_size=%lu, "
+ "use_bufring=%d\n", type, header_size,
+ use_bufring);
+
+ return err;
}
static int setup_fuse_copy_state(struct fuse_copy_state *cs,
@@ -643,14 +753,23 @@ static int setup_fuse_copy_state(struct fuse_copy_state *cs,
{
int err;
- err = import_ubuf(dir, ent->payload, ring->max_payload_sz, iter);
- if (err) {
- pr_info_ratelimited("fuse: Import of user buffer failed\n");
- return err;
+ if (!ent->queue->use_bufring) {
+ err = import_ubuf(dir, ent->payload, ring->max_payload_sz, iter);
+ if (err) {
+ pr_info_ratelimited("fuse: Import of user buffer "
+ "failed\n");
+ return err;
+ }
}
fuse_copy_init(cs, dir == ITER_DEST, iter);
+ if (ent->queue->use_bufring) {
+ cs->is_kaddr = true;
+ cs->len = ent->payload_kvec.iov_len;
+ cs->kaddr = ent->payload_kvec.iov_base;
+ }
+
cs->is_uring = true;
cs->req = req;
@@ -762,6 +881,103 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
sizeof(req->in.h));
}
+static bool fuse_uring_req_has_payload(struct fuse_req *req)
+{
+ struct fuse_args *args = req->args;
+
+ return args->in_numargs > 1 || args->out_numargs;
+}
+
+static int fuse_uring_select_buffer(struct fuse_ring_ent *ent,
+ unsigned int issue_flags)
+ __must_hold(&queue->lock)
+{
+ struct io_br_sel sel;
+ size_t len = 0;
+
+ lockdep_assert_held(&ent->queue->lock);
+
+ /* Get a buffer to use for the payload */
+ sel = io_ring_buffer_select(cmd_to_io_kiocb(ent->cmd), &len,
+ ent->queue->bufring, issue_flags);
+ if (sel.val)
+ return sel.val;
+ if (!sel.kaddr)
+ return -ENOENT;
+
+ ent->payload_kvec.iov_base = sel.kaddr;
+ ent->payload_kvec.iov_len = len;
+ ent->ringbuf_buf_id = sel.buf_id;
+
+ return 0;
+}
+
+static void fuse_uring_clean_up_buffer(struct fuse_ring_ent *ent,
+ unsigned int issue_flags)
+ __must_hold(&queue->lock)
+{
+ struct kvec *kvec = &ent->payload_kvec;
+
+ lockdep_assert_held(&ent->queue->lock);
+
+ if (!ent->queue->use_bufring || !kvec->iov_base)
+ return;
+
+ WARN_ON_ONCE(io_uring_cmd_kmbuffer_recycle(ent->cmd,
+ FUSE_URING_RINGBUF_GROUP,
+ (u64)kvec->iov_base,
+ kvec->iov_len,
+ ent->ringbuf_buf_id,
+ issue_flags));
+
+ memset(kvec, 0, sizeof(*kvec));
+}
+
+static int fuse_uring_next_req_update_buffer(struct fuse_ring_ent *ent,
+ struct fuse_req *req,
+ unsigned int issue_flags)
+{
+ bool buffer_selected;
+ bool has_payload;
+
+ if (!ent->queue->use_bufring)
+ return 0;
+
+ ent->headers_iter.data_source = false;
+
+ buffer_selected = ent->payload_kvec.iov_base != 0;
+ has_payload = fuse_uring_req_has_payload(req);
+
+ if (has_payload && !buffer_selected)
+ return fuse_uring_select_buffer(ent, issue_flags);
+
+ if (!has_payload && buffer_selected)
+ fuse_uring_clean_up_buffer(ent, issue_flags);
+
+ return 0;
+}
+
+static int fuse_uring_prep_buffer(struct fuse_ring_ent *ent,
+ struct fuse_req *req, unsigned int dir,
+ unsigned issue_flags)
+{
+ if (!ent->queue->use_bufring)
+ return 0;
+
+ if (dir == ITER_SOURCE) {
+ ent->headers_iter.data_source = true;
+ return 0;
+ }
+
+ ent->headers_iter.data_source = false;
+
+ /* no payload to copy, can skip selecting a buffer */
+ if (!fuse_uring_req_has_payload(req))
+ return 0;
+
+ return fuse_uring_select_buffer(ent, issue_flags);
+}
+
static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
struct fuse_req *req)
{
@@ -824,21 +1040,29 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
}
/* Fetch the next fuse request if available */
-static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent)
+static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent,
+ unsigned int issue_flags)
__must_hold(&queue->lock)
{
struct fuse_req *req;
struct fuse_ring_queue *queue = ent->queue;
struct list_head *req_queue = &queue->fuse_req_queue;
+ int err;
lockdep_assert_held(&queue->lock);
/* get and assign the next entry while it is still holding the lock */
req = list_first_entry_or_null(req_queue, struct fuse_req, list);
- if (req)
- fuse_uring_add_req_to_ring_ent(ent, req);
+ if (req) {
+ err = fuse_uring_next_req_update_buffer(ent, req, issue_flags);
+ if (!err) {
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ return req;
+ }
+ }
- return req;
+ fuse_uring_clean_up_buffer(ent, issue_flags);
+ return NULL;
}
/*
@@ -878,7 +1102,8 @@ static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
* Else, there is no next fuse request and this returns false.
*/
static bool fuse_uring_get_next_fuse_req(struct fuse_ring_ent *ent,
- struct fuse_ring_queue *queue)
+ struct fuse_ring_queue *queue,
+ unsigned int issue_flags)
{
int err;
struct fuse_req *req;
@@ -886,7 +1111,7 @@ static bool fuse_uring_get_next_fuse_req(struct fuse_ring_ent *ent,
retry:
spin_lock(&queue->lock);
fuse_uring_ent_avail(ent, queue);
- req = fuse_uring_ent_assign_req(ent);
+ req = fuse_uring_ent_assign_req(ent, issue_flags);
spin_unlock(&queue->lock);
if (req) {
@@ -927,6 +1152,38 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
io_uring_cmd_done(cmd, ret, issue_flags);
}
+static void fuse_uring_headers_cleanup(struct fuse_ring_ent *ent,
+ unsigned int issue_flags)
+{
+ if (!ent->queue->use_bufring)
+ return;
+
+ WARN_ON_ONCE(io_uring_cmd_fixed_index_put(ent->cmd,
+ FUSE_URING_FIXED_HEADERS_OFFSET,
+ issue_flags));
+}
+
+static int fuse_uring_headers_prep(struct fuse_ring_ent *ent, unsigned int dir,
+ unsigned int issue_flags)
+{
+ size_t header_size = sizeof(struct fuse_uring_req_header);
+ struct io_uring_cmd *cmd = ent->cmd;
+ unsigned int offset;
+ int err;
+
+ if (!ent->queue->use_bufring)
+ return 0;
+
+ offset = ent->fixed_buf_id * header_size;
+
+ err = io_uring_cmd_fixed_index_get(cmd, FUSE_URING_FIXED_HEADERS_OFFSET,
+ offset, header_size, dir,
+ &ent->headers_iter, issue_flags);
+
+ WARN_ON_ONCE(err);
+ return err;
+}
+
/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */
static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
struct fuse_conn *fc)
@@ -940,6 +1197,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
unsigned int qid = READ_ONCE(cmd_req->qid);
struct fuse_pqueue *fpq;
struct fuse_req *req;
+ bool send;
err = -ENOTCONN;
if (!ring)
@@ -990,7 +1248,12 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
/* without the queue lock, as other locks are taken */
fuse_uring_prepare_cancel(cmd, issue_flags, ent);
- fuse_uring_commit(ent, req, issue_flags);
+
+ err = fuse_uring_headers_prep(ent, ITER_SOURCE, issue_flags);
+ if (err)
+ fuse_uring_req_end(ent, req, err);
+ else
+ fuse_uring_commit(ent, req, issue_flags);
/*
* Fetching the next request is absolutely required as queued
@@ -998,7 +1261,9 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
* and fetching is done in one step vs legacy fuse, which has separated
* read (fetch request) and write (commit result).
*/
- if (fuse_uring_get_next_fuse_req(ent, queue))
+ send = fuse_uring_get_next_fuse_req(ent, queue, issue_flags);
+ fuse_uring_headers_cleanup(ent, issue_flags);
+ if (send)
fuse_uring_send(ent, cmd, 0, issue_flags);
return 0;
}
@@ -1094,39 +1359,48 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
struct iovec iov[FUSE_URING_IOV_SEGS];
int err;
+ ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
+ if (!ent)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ent->list);
+
+ ent->queue = queue;
+
+ if (queue->use_bufring) {
+ ent->fixed_buf_id = READ_ONCE(cmd->sqe->buf_index);
+ atomic_inc(&ring->queue_refs);
+ return ent;
+ }
+
err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov);
if (err) {
pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n",
err);
- return ERR_PTR(err);
+ goto error;
}
err = -EINVAL;
if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) {
pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len);
- return ERR_PTR(err);
+ goto error;
}
payload_size = iov[1].iov_len;
if (payload_size < ring->max_payload_sz) {
pr_info_ratelimited("Invalid req payload len %zu\n",
payload_size);
- return ERR_PTR(err);
+ goto error;
}
-
- err = -ENOMEM;
- ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
- if (!ent)
- return ERR_PTR(err);
-
- INIT_LIST_HEAD(&ent->list);
-
- ent->queue = queue;
ent->headers = iov[0].iov_base;
ent->payload = iov[1].iov_base;
atomic_inc(&ring->queue_refs);
return ent;
+
+error:
+ kfree(ent);
+ return ERR_PTR(err);
}
/*
@@ -1137,6 +1411,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
unsigned int issue_flags, struct fuse_conn *fc)
{
const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ bool use_bufring = READ_ONCE(cmd_req->init.use_bufring);
struct fuse_ring *ring = smp_load_acquire(&fc->ring);
struct fuse_ring_queue *queue;
struct fuse_ring_ent *ent;
@@ -1157,9 +1432,13 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
queue = ring->queues[qid];
if (!queue) {
- queue = fuse_uring_create_queue(ring, qid);
- if (!queue)
- return err;
+ queue = fuse_uring_create_queue(cmd, ring, qid, use_bufring,
+ issue_flags);
+ if (IS_ERR(queue))
+ return PTR_ERR(queue);
+ } else {
+ if (queue->use_bufring != use_bufring)
+ return -EINVAL;
}
/*
@@ -1258,15 +1537,19 @@ static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw)
struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
struct fuse_ring_queue *queue = ent->queue;
+ bool send = true;
int err;
if (!tw.cancel) {
- err = fuse_uring_prepare_send(ent, ent->fuse_req);
- if (err) {
- if (!fuse_uring_get_next_fuse_req(ent, queue))
- return;
- err = 0;
- }
+ if (fuse_uring_headers_prep(ent, ITER_DEST, issue_flags))
+ return;
+
+ if (fuse_uring_prepare_send(ent, ent->fuse_req))
+ send = fuse_uring_get_next_fuse_req(ent, queue, issue_flags);
+ fuse_uring_headers_cleanup(ent, issue_flags);
+ if (!send)
+ return;
+ err = 0;
} else {
err = -ECANCELED;
}
@@ -1325,14 +1608,20 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
req->ring_queue = queue;
ent = list_first_entry_or_null(&queue->ent_avail_queue,
struct fuse_ring_ent, list);
- if (ent)
- fuse_uring_add_req_to_ring_ent(ent, req);
- else
- list_add_tail(&req->list, &queue->fuse_req_queue);
- spin_unlock(&queue->lock);
+ if (ent) {
+ err = fuse_uring_prep_buffer(ent, req, ITER_DEST,
+ IO_URING_F_UNLOCKED);
+ if (!err) {
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ spin_unlock(&queue->lock);
+ fuse_uring_dispatch_ent(ent);
+ return;
+ }
+ WARN_ON_ONCE(err != -ENOENT);
+ }
- if (ent)
- fuse_uring_dispatch_ent(ent);
+ list_add_tail(&req->list, &queue->fuse_req_queue);
+ spin_unlock(&queue->lock);
return;
@@ -1350,6 +1639,7 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
struct fuse_ring *ring = fc->ring;
struct fuse_ring_queue *queue;
struct fuse_ring_ent *ent = NULL;
+ int err;
queue = fuse_uring_task_to_queue(ring);
if (!queue)
@@ -1382,14 +1672,16 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req,
list);
if (ent && req) {
- fuse_uring_add_req_to_ring_ent(ent, req);
- spin_unlock(&queue->lock);
-
- fuse_uring_dispatch_ent(ent);
- } else {
- spin_unlock(&queue->lock);
+ err = fuse_uring_prep_buffer(ent, req, ITER_DEST,
+ IO_URING_F_UNLOCKED);
+ if (!err) {
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ spin_unlock(&queue->lock);
+ fuse_uring_dispatch_ent(ent);
+ return true;
+ }
}
-
+ spin_unlock(&queue->lock);
return true;
}
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 51a563922ce1..eff14557066d 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -7,6 +7,8 @@
#ifndef _FS_FUSE_DEV_URING_I_H
#define _FS_FUSE_DEV_URING_I_H
+#include <linux/uio.h>
+
#include "fuse_i.h"
#ifdef CONFIG_FUSE_IO_URING
@@ -38,9 +40,25 @@ enum fuse_ring_req_state {
/** A fuse ring entry, part of the ring queue */
struct fuse_ring_ent {
- /* userspace buffer */
- struct fuse_uring_req_header __user *headers;
- void __user *payload;
+ union {
+ /* queue->use_bufring == false */
+ struct {
+ /* userspace buffers */
+ struct fuse_uring_req_header __user *headers;
+ void __user *payload;
+ };
+ /* queue->use_bufring == true */
+ struct {
+ struct iov_iter headers_iter;
+ struct kvec payload_kvec;
+ /*
+ * This needs to be tracked in order to properly recycle
+ * the buffer when done with it
+ */
+ unsigned int ringbuf_buf_id;
+ unsigned int fixed_buf_id;
+ };
+ };
/* the ring queue that owns the request */
struct fuse_ring_queue *queue;
@@ -99,6 +117,12 @@ struct fuse_ring_queue {
unsigned int active_background;
bool stopped;
+
+ /* true if kernel-managed buffer ring is used */
+ bool use_bufring: 1;
+
+ /* synchronized by the queue lock */
+ struct io_buffer_list *bufring;
};
/**
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index c13e1f9a2f12..3041177e3dd8 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -240,6 +240,9 @@
* - add FUSE_COPY_FILE_RANGE_64
* - add struct fuse_copy_file_range_out
* - add FUSE_NOTIFY_PRUNE
+ *
+ * 7.46
+ * - add fuse_uring_cmd_req use_bufring
*/
#ifndef _LINUX_FUSE_H
@@ -1305,7 +1308,14 @@ struct fuse_uring_cmd_req {
/* queue the command is for (queue index) */
uint16_t qid;
- uint8_t padding[6];
+
+ union {
+ struct {
+ bool use_bufring;
+ } init;
+ };
+
+ uint8_t padding[5];
};
#endif /* _LINUX_FUSE_H */
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* Re: [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring
2025-12-18 8:33 ` [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring Joanne Koong
@ 2025-12-20 22:45 ` kernel test robot
2025-12-21 2:10 ` kernel test robot
2025-12-22 17:23 ` kernel test robot
2 siblings, 0 replies; 33+ messages in thread
From: kernel test robot @ 2025-12-20 22:45 UTC (permalink / raw)
To: Joanne Koong, miklos, axboe
Cc: oe-kbuild-all, bschubert, asml.silence, io-uring, csander,
xiaobing.li, linux-fsdevel
Hi Joanne,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe/for-next]
[also build test WARNING on linus/master v6.19-rc1 next-20251219]
[cannot apply to mszeredi-fuse/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Joanne-Koong/io_uring-kbuf-refactor-io_buf_pbuf_register-logic-into-generic-helpers/20251218-165107
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20251218083319.3485503-20-joannelkoong%40gmail.com
patch subject: [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring
config: i386-allnoconfig-bpf (https://download.01.org/0day-ci/archive/20251220/202512202342.AGVIgnBx-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251220/202512202342.AGVIgnBx-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512202342.AGVIgnBx-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> fs/fuse/dev_uring.c:704:35: warning: format specifies type 'unsigned long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
703 | "header_type=%u, header_size=%lu, "
| ~~~
| %zu
704 | "use_bufring=%d\n", type, header_size,
| ^~~~~~~~~~~
./include/linux/printk.h:726:46: note: expanded from macro 'pr_info_ratelimited'
726 | printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
| ~~~ ^~~~~~~~~~~
./include/linux/printk.h:706:17: note: expanded from macro 'printk_ratelimited'
706 | printk(fmt, ##__VA_ARGS__); \
| ~~~ ^~~~~~~~~~~
./include/linux/printk.h:512:60: note: expanded from macro 'printk'
512 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
| ~~~ ^~~~~~~~~~~
./include/linux/printk.h:484:19: note: expanded from macro 'printk_index_wrap'
484 | _p_func(_fmt, ##__VA_ARGS__); \
| ~~~~ ^~~~~~~~~~~
fs/fuse/dev_uring.c:743:35: warning: format specifies type 'unsigned long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
742 | "header_type=%u, header_size=%lu, "
| ~~~
| %zu
743 | "use_bufring=%d\n", type, header_size,
| ^~~~~~~~~~~
./include/linux/printk.h:726:46: note: expanded from macro 'pr_info_ratelimited'
726 | printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
| ~~~ ^~~~~~~~~~~
./include/linux/printk.h:706:17: note: expanded from macro 'printk_ratelimited'
706 | printk(fmt, ##__VA_ARGS__); \
| ~~~ ^~~~~~~~~~~
./include/linux/printk.h:512:60: note: expanded from macro 'printk'
512 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
| ~~~ ^~~~~~~~~~~
./include/linux/printk.h:484:19: note: expanded from macro 'printk_index_wrap'
484 | _p_func(_fmt, ##__VA_ARGS__); \
| ~~~~ ^~~~~~~~~~~
2 warnings generated.
vim +704 fs/fuse/dev_uring.c
670
671 static __always_inline int copy_header_to_ring(struct fuse_ring_ent *ent,
672 enum fuse_uring_header_type type,
673 const void *header,
674 size_t header_size)
675 {
676 bool use_bufring = ent->queue->use_bufring;
677 int err = 0;
678
679 if (use_bufring) {
680 struct iov_iter iter;
681
682 err = get_kernel_ring_header(ent, type, &iter);
683 if (err)
684 goto done;
685
686 if (copy_to_iter(header, header_size, &iter) != header_size)
687 err = -EFAULT;
688 } else {
689 void __user *ring = get_user_ring_header(ent, type);
690
691 if (!ring) {
692 err = -EINVAL;
693 goto done;
694 }
695
696 if (copy_to_user(ring, header, header_size))
697 err = -EFAULT;
698 }
699
700 done:
701 if (err)
702 pr_info_ratelimited("Copying header to ring failed: "
703 "header_type=%u, header_size=%lu, "
> 704 "use_bufring=%d\n", type, header_size,
705 use_bufring);
706
707 return err;
708 }
709
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring
2025-12-18 8:33 ` [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring Joanne Koong
2025-12-20 22:45 ` kernel test robot
@ 2025-12-21 2:10 ` kernel test robot
2025-12-22 17:23 ` kernel test robot
2 siblings, 0 replies; 33+ messages in thread
From: kernel test robot @ 2025-12-21 2:10 UTC (permalink / raw)
To: Joanne Koong, miklos, axboe
Cc: oe-kbuild-all, bschubert, asml.silence, io-uring, csander,
xiaobing.li, linux-fsdevel
Hi Joanne,
kernel test robot noticed the following build errors:
[auto build test ERROR on axboe/for-next]
[also build test ERROR on linus/master v6.19-rc1 next-20251219]
[cannot apply to mszeredi-fuse/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Joanne-Koong/io_uring-kbuf-refactor-io_buf_pbuf_register-logic-into-generic-helpers/20251218-165107
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20251218083319.3485503-20-joannelkoong%40gmail.com
patch subject: [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring
config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251221/202512210325.13rE0qzj-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251221/202512210325.13rE0qzj-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512210325.13rE0qzj-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from <command-line>:
>> ./usr/include/linux/fuse.h:1310:25: error: unknown type name 'bool'
1310 | bool use_bufring;
| ^~~~
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring
2025-12-18 8:33 ` [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring Joanne Koong
2025-12-20 22:45 ` kernel test robot
2025-12-21 2:10 ` kernel test robot
@ 2025-12-22 17:23 ` kernel test robot
2 siblings, 0 replies; 33+ messages in thread
From: kernel test robot @ 2025-12-22 17:23 UTC (permalink / raw)
To: Joanne Koong, miklos, axboe
Cc: llvm, oe-kbuild-all, bschubert, asml.silence, io-uring, csander,
xiaobing.li, linux-fsdevel
Hi Joanne,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe/for-next]
[also build test WARNING on linus/master v6.19-rc2 next-20251219]
[cannot apply to mszeredi-fuse/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Joanne-Koong/io_uring-kbuf-refactor-io_buf_pbuf_register-logic-into-generic-helpers/20251218-165107
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20251218083319.3485503-20-joannelkoong%40gmail.com
patch subject: [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring
config: um-randconfig-001-20251222 (https://download.01.org/0day-ci/archive/20251223/202512230043.PJcZViVh-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 185f5fd5ce4c65116ca8cf6df467a682ef090499)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251223/202512230043.PJcZViVh-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512230043.PJcZViVh-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from fs/fuse/dev_uring.c:7:
In file included from fs/fuse/fuse_i.h:23:
In file included from include/linux/backing-dev.h:16:
In file included from include/linux/writeback.h:13:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from arch/um/include/asm/hardirq.h:24:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:12:
In file included from arch/um/include/asm/io.h:24:
include/asm-generic/io.h:1209:55: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
1209 | return (port > MMIO_UPPER_LIMIT) ? NULL : PCI_IOBASE + port;
| ~~~~~~~~~~ ^
>> fs/fuse/dev_uring.c:704:35: warning: format specifies type 'unsigned long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
703 | "header_type=%u, header_size=%lu, "
| ~~~
| %zu
704 | "use_bufring=%d\n", type, header_size,
| ^~~~~~~~~~~
include/linux/printk.h:726:46: note: expanded from macro 'pr_info_ratelimited'
726 | printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
| ~~~ ^~~~~~~~~~~
include/linux/printk.h:706:17: note: expanded from macro 'printk_ratelimited'
706 | printk(fmt, ##__VA_ARGS__); \
| ~~~ ^~~~~~~~~~~
include/linux/printk.h:512:60: note: expanded from macro 'printk'
512 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
| ~~~ ^~~~~~~~~~~
include/linux/printk.h:484:19: note: expanded from macro 'printk_index_wrap'
484 | _p_func(_fmt, ##__VA_ARGS__); \
| ~~~~ ^~~~~~~~~~~
fs/fuse/dev_uring.c:743:35: warning: format specifies type 'unsigned long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
742 | "header_type=%u, header_size=%lu, "
| ~~~
| %zu
743 | "use_bufring=%d\n", type, header_size,
| ^~~~~~~~~~~
include/linux/printk.h:726:46: note: expanded from macro 'pr_info_ratelimited'
726 | printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
| ~~~ ^~~~~~~~~~~
include/linux/printk.h:706:17: note: expanded from macro 'printk_ratelimited'
706 | printk(fmt, ##__VA_ARGS__); \
| ~~~ ^~~~~~~~~~~
include/linux/printk.h:512:60: note: expanded from macro 'printk'
512 | #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
| ~~~ ^~~~~~~~~~~
include/linux/printk.h:484:19: note: expanded from macro 'printk_index_wrap'
484 | _p_func(_fmt, ##__VA_ARGS__); \
| ~~~~ ^~~~~~~~~~~
3 warnings generated.
vim +704 fs/fuse/dev_uring.c
670
671 static __always_inline int copy_header_to_ring(struct fuse_ring_ent *ent,
672 enum fuse_uring_header_type type,
673 const void *header,
674 size_t header_size)
675 {
676 bool use_bufring = ent->queue->use_bufring;
677 int err = 0;
678
679 if (use_bufring) {
680 struct iov_iter iter;
681
682 err = get_kernel_ring_header(ent, type, &iter);
683 if (err)
684 goto done;
685
686 if (copy_to_iter(header, header_size, &iter) != header_size)
687 err = -EFAULT;
688 } else {
689 void __user *ring = get_user_ring_header(ent, type);
690
691 if (!ring) {
692 err = -EINVAL;
693 goto done;
694 }
695
696 if (copy_to_user(ring, header, header_size))
697 err = -EFAULT;
698 }
699
700 done:
701 if (err)
702 pr_info_ratelimited("Copying header to ring failed: "
703 "header_type=%u, header_size=%lu, "
> 704 "use_bufring=%d\n", type, header_size,
705 use_bufring);
706
707 return err;
708 }
709
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH v2 20/25] io_uring/rsrc: rename io_buffer_register_bvec()/io_buffer_unregister_bvec()
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (18 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 19/25] fuse: add io-uring kernel-managed buffer ring Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 21/25] io_uring/rsrc: split io_buffer_register_request() logic Joanne Koong
` (4 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Currently, io_buffer_register_bvec() takes in a request. In preparation
for supporting kernel-populated buffers in fuse io-uring (which will
need to register bvecs directly, not through a struct request), rename
this to io_buffer_register_request().
A subsequent patch will commandeer the "io_buffer_register_bvec()"
function name to support registering bvecs directly.
Rename io_buffer_unregister_bvec() to a more generic name,
io_buffer_unregister(), as both io_buffer_register_request() and
io_buffer_register_bvec() callers will use it for unregistration.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
Documentation/block/ublk.rst | 14 +++++++-------
drivers/block/ublk_drv.c | 18 +++++++++---------
include/linux/io_uring/cmd.h | 26 ++++++++++++++++++++------
io_uring/rsrc.c | 14 +++++++-------
4 files changed, 43 insertions(+), 29 deletions(-)
diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst
index 8c4030bcabb6..aa6e0bf9405b 100644
--- a/Documentation/block/ublk.rst
+++ b/Documentation/block/ublk.rst
@@ -326,17 +326,17 @@ Zero copy
---------
ublk zero copy relies on io_uring's fixed kernel buffer, which provides
-two APIs: `io_buffer_register_bvec()` and `io_buffer_unregister_bvec`.
+two APIs: `io_buffer_register_request()` and `io_buffer_unregister()`.
ublk adds IO command of `UBLK_IO_REGISTER_IO_BUF` to call
-`io_buffer_register_bvec()` for ublk server to register client request
+`io_buffer_register_request()` for ublk server to register client request
buffer into io_uring buffer table, then ublk server can submit io_uring
IOs with the registered buffer index. IO command of `UBLK_IO_UNREGISTER_IO_BUF`
-calls `io_buffer_unregister_bvec()` to unregister the buffer, which is
-guaranteed to be live between calling `io_buffer_register_bvec()` and
-`io_buffer_unregister_bvec()`. Any io_uring operation which supports this
-kind of kernel buffer will grab one reference of the buffer until the
-operation is completed.
+calls `io_buffer_unregister()` to unregister the buffer, which is guaranteed
+to be live between calling `io_buffer_register_request()` and
+`io_buffer_unregister()`. Any io_uring operation which supports this kind of
+kernel buffer will grab one reference of the buffer until the operation is
+completed.
ublk server implementing zero copy or user copy has to be CAP_SYS_ADMIN and
be trusted, because it is ublk server's responsibility to make sure IO buffer
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index df9831783a13..0a42f6a75b62 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1202,8 +1202,8 @@ __ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
{
int ret;
- ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
- io->buf.auto_reg.index, issue_flags);
+ ret = io_buffer_register_request(cmd, req, ublk_io_release,
+ io->buf.auto_reg.index, issue_flags);
if (ret) {
if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
ublk_auto_buf_reg_fallback(ubq, req->tag);
@@ -2166,8 +2166,8 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
if (!req)
return -EINVAL;
- ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
- issue_flags);
+ ret = io_buffer_register_request(cmd, req, ublk_io_release, index,
+ issue_flags);
if (ret) {
ublk_put_req_ref(io, req);
return ret;
@@ -2198,8 +2198,8 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
return -EINVAL;
- ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
- issue_flags);
+ ret = io_buffer_register_request(cmd, req, ublk_io_release, index,
+ issue_flags);
if (ret)
return ret;
@@ -2214,7 +2214,7 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
return -EINVAL;
- return io_buffer_unregister_bvec(cmd, index, issue_flags);
+ return io_buffer_unregister(cmd, index, issue_flags);
}
static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
@@ -2350,7 +2350,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
goto out;
/*
- * io_buffer_unregister_bvec() doesn't access the ubq or io,
+ * io_buffer_unregister() doesn't access the ubq or io,
* so no need to validate the q_id, tag, or task
*/
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
@@ -2420,7 +2420,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
/* can't touch 'ublk_io' any more */
if (buf_idx != UBLK_INVALID_BUF_IDX)
- io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
+ io_buffer_unregister(cmd, buf_idx, issue_flags);
if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = addr;
if (compl)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 61c4ca863ef6..06e4cfadb344 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -102,6 +102,12 @@ int io_uring_cmd_kmbuffer_recycle(struct io_uring_cmd *cmd,
int io_uring_cmd_is_kmbuf_ring(struct io_uring_cmd *ioucmd,
unsigned int buf_group, unsigned int issue_flags);
+
+int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
+ void (*release)(void *), unsigned int index,
+ unsigned int issue_flags);
+int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
+ unsigned int issue_flags);
#else
static inline int
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -185,6 +191,20 @@ static inline int io_uring_cmd_is_kmbuf_ring(struct io_uring_cmd *ioucmd,
{
return -EOPNOTSUPP;
}
+static inline int io_buffer_register_request(struct io_uring_cmd *cmd,
+ struct request *rq,
+ void (*release)(void *),
+ unsigned int index,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
+static inline int io_buffer_unregister(struct io_uring_cmd *cmd,
+ unsigned int index,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
#endif
static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req)
@@ -234,10 +254,4 @@ static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret,
return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true);
}
-int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
- void (*release)(void *), unsigned int index,
- unsigned int issue_flags);
-int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
- unsigned int issue_flags);
-
#endif /* _LINUX_IO_URING_CMD_H */
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a141aaeb099d..b25b418e5c11 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -936,9 +936,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
- void (*release)(void *), unsigned int index,
- unsigned int issue_flags)
+int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
+ void (*release)(void *), unsigned int index,
+ unsigned int issue_flags)
{
struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
struct io_rsrc_data *data = &ctx->buf_table;
@@ -998,10 +998,10 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
-EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
+EXPORT_SYMBOL_GPL(io_buffer_register_request);
-int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
- unsigned int issue_flags)
+int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
+ unsigned int issue_flags)
{
struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
struct io_rsrc_data *data = &ctx->buf_table;
@@ -1031,7 +1031,7 @@ int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
-EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
+EXPORT_SYMBOL_GPL(io_buffer_unregister);
static int validate_fixed_range(u64 buf_addr, size_t len,
const struct io_mapped_ubuf *imu)
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 21/25] io_uring/rsrc: split io_buffer_register_request() logic
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (19 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 20/25] io_uring/rsrc: rename io_buffer_register_bvec()/io_buffer_unregister_bvec() Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 22/25] io_uring/rsrc: Allow buffer release callback to be optional Joanne Koong
` (3 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Split the main initialization logic in io_buffer_register_request() into
a helper function.
This is a preparatory patch for supporting kernel-populated buffers in
fuse io-uring, which will be reusing this logic.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
io_uring/rsrc.c | 89 ++++++++++++++++++++++++++++++-------------------
1 file changed, 54 insertions(+), 35 deletions(-)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index b25b418e5c11..5fe2695dafb6 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -936,67 +936,86 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
- void (*release)(void *), unsigned int index,
- unsigned int issue_flags)
+static struct io_mapped_ubuf *io_kernel_buffer_init(struct io_ring_ctx *ctx,
+ unsigned int nr_bvecs,
+ unsigned int total_bytes,
+ u8 dir,
+ void (*release)(void *),
+ void *priv,
+ unsigned int index)
{
- struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
struct io_rsrc_data *data = &ctx->buf_table;
- struct req_iterator rq_iter;
struct io_mapped_ubuf *imu;
struct io_rsrc_node *node;
- struct bio_vec bv;
- unsigned int nr_bvecs = 0;
- int ret = 0;
- io_ring_submit_lock(ctx, issue_flags);
- if (index >= data->nr) {
- ret = -EINVAL;
- goto unlock;
- }
+ if (index >= data->nr)
+ return ERR_PTR(-EINVAL);
index = array_index_nospec(index, data->nr);
- if (data->nodes[index]) {
- ret = -EBUSY;
- goto unlock;
- }
+ if (data->nodes[index])
+ return ERR_PTR(-EBUSY);
node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
- if (!node) {
- ret = -ENOMEM;
- goto unlock;
- }
+ if (!node)
+ return ERR_PTR(-ENOMEM);
- /*
- * blk_rq_nr_phys_segments() may overestimate the number of bvecs
- * but avoids needing to iterate over the bvecs
- */
- imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
+ imu = io_alloc_imu(ctx, nr_bvecs);
if (!imu) {
kfree(node);
- ret = -ENOMEM;
- goto unlock;
+ return ERR_PTR(-ENOMEM);
}
imu->ubuf = 0;
- imu->len = blk_rq_bytes(rq);
+ imu->len = total_bytes;
imu->acct_pages = 0;
imu->folio_shift = PAGE_SHIFT;
+ imu->nr_bvecs = nr_bvecs;
refcount_set(&imu->refs, 1);
imu->release = release;
- imu->priv = rq;
+ imu->priv = priv;
imu->is_kbuf = true;
- imu->dir = 1 << rq_data_dir(rq);
+ imu->dir = 1 << dir;
+ node->buf = imu;
+ data->nodes[index] = node;
+
+ return imu;
+}
+
+int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
+ void (*release)(void *), unsigned int index,
+ unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct req_iterator rq_iter;
+ struct io_mapped_ubuf *imu;
+ struct bio_vec bv;
+ unsigned int nr_bvecs;
+ unsigned int total_bytes;
+
+ /*
+ * blk_rq_nr_phys_segments() may overestimate the number of bvecs
+ * but avoids needing to iterate over the bvecs
+ */
+ nr_bvecs = blk_rq_nr_phys_segments(rq);
+ total_bytes = blk_rq_bytes(rq);
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ imu = io_kernel_buffer_init(ctx, nr_bvecs, total_bytes, rq_data_dir(rq),
+ release, rq, index);
+ if (IS_ERR(imu)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return PTR_ERR(imu);
+ }
+
+ nr_bvecs = 0;
rq_for_each_bvec(bv, rq, rq_iter)
imu->bvec[nr_bvecs++] = bv;
imu->nr_bvecs = nr_bvecs;
- node->buf = imu;
- data->nodes[index] = node;
-unlock:
io_ring_submit_unlock(ctx, issue_flags);
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(io_buffer_register_request);
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 22/25] io_uring/rsrc: Allow buffer release callback to be optional
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (20 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 21/25] io_uring/rsrc: split io_buffer_register_request() logic Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 23/25] io_uring/rsrc: add io_buffer_register_bvec() Joanne Koong
` (2 subsequent siblings)
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
This is a preparatory patch for supporting kernel-populated buffers in
fuse io-uring, which does not need a release callback.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
---
io_uring/rsrc.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 5fe2695dafb6..5a708cecba4a 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,7 +148,8 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
if (imu->acct_pages)
io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
- imu->release(imu->priv);
+ if (imu->release)
+ imu->release(imu->priv);
io_free_imu(ctx, imu);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 23/25] io_uring/rsrc: add io_buffer_register_bvec()
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (21 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 22/25] io_uring/rsrc: Allow buffer release callback to be optional Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 24/25] fuse: add zero-copy over io-uring Joanne Koong
2025-12-18 8:33 ` [PATCH v2 25/25] docs: fuse: add io-uring bufring and zero-copy documentation Joanne Koong
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add io_buffer_register_bvec() for registering a bvec array.
This is a preparatory patch for fuse-over-io-uring zero-copy.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/io_uring/cmd.h | 12 ++++++++++++
io_uring/rsrc.c | 27 +++++++++++++++++++++++++++
2 files changed, 39 insertions(+)
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 06e4cfadb344..f5094eb1206a 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -106,6 +106,9 @@ int io_uring_cmd_is_kmbuf_ring(struct io_uring_cmd *ioucmd,
int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags);
+int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct bio_vec *bvs,
+ unsigned int nr_bvecs, unsigned int total_bytes,
+ u8 dir, unsigned int index, unsigned int issue_flags);
int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
unsigned int issue_flags);
#else
@@ -199,6 +202,15 @@ static inline int io_buffer_register_request(struct io_uring_cmd *cmd,
{
return -EOPNOTSUPP;
}
+static inline int io_buffer_register_bvec(struct io_uring_cmd *cmd,
+ struct bio_vec *bvs,
+ unsigned int nr_bvecs,
+ unsigned int total_bytes, u8 dir,
+ unsigned int index,
+ unsigned int issue_flags)
+{
+ return -EOPNOTSUPP;
+}
static inline int io_buffer_unregister(struct io_uring_cmd *cmd,
unsigned int index,
unsigned int issue_flags)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 5a708cecba4a..32126c06f4c9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1020,6 +1020,33 @@ int io_buffer_register_request(struct io_uring_cmd *cmd, struct request *rq,
}
EXPORT_SYMBOL_GPL(io_buffer_register_request);
+int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct bio_vec *bvs,
+ unsigned int nr_bvecs, unsigned int total_bytes,
+ u8 dir, unsigned int index,
+ unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct io_mapped_ubuf *imu;
+ struct bio_vec *bvec;
+ int i;
+
+ io_ring_submit_lock(ctx, issue_flags);
+ imu = io_kernel_buffer_init(ctx, nr_bvecs, total_bytes, dir, NULL,
+ NULL, index);
+ if (IS_ERR(imu)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return PTR_ERR(imu);
+ }
+
+ bvec = imu->bvec;
+ for (i = 0; i < nr_bvecs; i++)
+ bvec[i] = bvs[i];
+
+ io_ring_submit_unlock(ctx, issue_flags);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
+
int io_buffer_unregister(struct io_uring_cmd *cmd, unsigned int index,
unsigned int issue_flags)
{
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 24/25] fuse: add zero-copy over io-uring
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (22 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 23/25] io_uring/rsrc: add io_buffer_register_bvec() Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-18 8:33 ` [PATCH v2 25/25] docs: fuse: add io-uring bufring and zero-copy documentation Joanne Koong
24 siblings, 0 replies; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Implement zero-copy data transfer for fuse over io-uring, eliminating
memory copies between kernel and userspace for read/write operations.
This is only allowed on privileged servers and requires the server to
preregister the following:
a) a sparse buffer corresponding to the queue depth
b) a fixed buffer at index queue_depth (the tail of the buffers)
c) a kernel-managed buffer ring
The sparse buffer is where the client's pages reside. The fixed buffer
at the tail is where the headers (struct fuse_uring_req_header) are
placed. The kernel-managed buffer ring is where any non-zero-copied args
reside (eg out headers).
Benchmarks with bs=1M showed approximately the following differences in
throughput:
direct randreads: ~20% increase (~2100 MB/s -> ~2600 MB/s)
buffered randreads: ~25% increase (~1900 MB/s -> 2400 MB/s)
direct randwrites: no difference (~750 MB/s)
buffered randwrites: ~10% increase (950 MB/s -> 1050 MB/s)
The benchmark was run using fio on the passthrough_hp server:
fio --name=test_run --ioengine=sync --rw=rand{read,write} --bs=1M
--size=1G --numjobs=2 --ramp_time=30 --group_reporting=1
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/dev.c | 7 +-
fs/fuse/dev_uring.c | 176 +++++++++++++++++++++++++++++++-------
fs/fuse/dev_uring_i.h | 11 +++
fs/fuse/fuse_dev_i.h | 1 +
include/uapi/linux/fuse.h | 6 +-
5 files changed, 164 insertions(+), 37 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ceb5d6a553c0..0f7f2d8b3951 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1229,8 +1229,11 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
for (i = 0; !err && i < numargs; i++) {
struct fuse_arg *arg = &args[i];
- if (i == numargs - 1 && argpages)
- err = fuse_copy_folios(cs, arg->size, zeroing);
+ if (i == numargs - 1 && argpages) {
+ if (cs->skip_folio_copy)
+ return 0;
+ return fuse_copy_folios(cs, arg->size, zeroing);
+ }
else
err = fuse_copy_one(cs, arg->value, arg->size);
}
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index d028cdd57f45..ff8531558785 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -89,8 +89,14 @@ static void fuse_uring_flush_bg(struct fuse_ring_queue *queue)
}
}
+static bool can_zero_copy_req(struct fuse_ring_ent *ent, struct fuse_req *req)
+{
+ return ent->queue->use_zero_copy &&
+ (req->args->in_pages || req->args->out_pages);
+}
+
static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
- int error)
+ int error, unsigned issue_flags)
{
struct fuse_ring_queue *queue = ent->queue;
struct fuse_ring *ring = queue->ring;
@@ -109,6 +115,12 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
spin_unlock(&queue->lock);
+ if (ent->zero_copied) {
+ WARN_ON_ONCE(io_buffer_unregister(ent->cmd, ent->fixed_buf_id,
+ issue_flags));
+ ent->zero_copied = false;
+ }
+
if (error)
req->out.h.error = error;
@@ -282,6 +294,7 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
static int fuse_uring_buf_ring_setup(struct io_uring_cmd *cmd,
struct fuse_ring_queue *queue,
+ bool zero_copy,
unsigned int issue_flags)
{
int err;
@@ -291,22 +304,39 @@ static int fuse_uring_buf_ring_setup(struct io_uring_cmd *cmd,
if (err)
return err;
+ err = -EINVAL;
+
if (!io_uring_cmd_is_kmbuf_ring(cmd, FUSE_URING_RINGBUF_GROUP,
- issue_flags)) {
- io_uring_cmd_buf_ring_unpin(cmd,
- FUSE_URING_RINGBUF_GROUP,
- issue_flags);
- return -EINVAL;
+ issue_flags))
+ goto error;
+
+ if (zero_copy) {
+ const struct fuse_uring_cmd_req *cmd_req =
+ io_uring_sqe_cmd(cmd->sqe);
+
+ if (!capable(CAP_SYS_ADMIN))
+ goto error;
+
+ queue->use_zero_copy = true;
+ queue->zero_copy_depth = READ_ONCE(cmd_req->init.queue_depth);
+ if (!queue->zero_copy_depth)
+ goto error;
}
queue->use_bufring = true;
return 0;
+
+error:
+ io_uring_cmd_buf_ring_unpin(cmd, FUSE_URING_RINGBUF_GROUP,
+ issue_flags);
+ return err;
}
static struct fuse_ring_queue *
fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
- int qid, bool use_bufring, unsigned int issue_flags)
+ int qid, bool use_bufring, bool zero_copy,
+ unsigned int issue_flags)
{
struct fuse_conn *fc = ring->fc;
struct fuse_ring_queue *queue;
@@ -338,12 +368,13 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
fuse_pqueue_init(&queue->fpq);
if (use_bufring) {
- err = fuse_uring_buf_ring_setup(cmd, queue, issue_flags);
- if (err) {
- kfree(pq);
- kfree(queue);
- return ERR_PTR(err);
- }
+ err = fuse_uring_buf_ring_setup(cmd, queue, zero_copy,
+ issue_flags);
+ if (err)
+ goto cleanup;
+ } else if (zero_copy) {
+ err = -EINVAL;
+ goto cleanup;
}
spin_lock(&fc->lock);
@@ -361,6 +392,11 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
spin_unlock(&fc->lock);
return queue;
+
+cleanup:
+ kfree(pq);
+ kfree(queue);
+ return ERR_PTR(err);
}
static void fuse_uring_stop_fuse_req_end(struct fuse_req *req)
@@ -768,6 +804,7 @@ static int setup_fuse_copy_state(struct fuse_copy_state *cs,
cs->is_kaddr = true;
cs->len = ent->payload_kvec.iov_len;
cs->kaddr = ent->payload_kvec.iov_base;
+ cs->skip_folio_copy = can_zero_copy_req(ent, req);
}
cs->is_uring = true;
@@ -800,11 +837,53 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
return err;
}
+static int fuse_uring_set_up_zero_copy(struct fuse_ring_ent *ent,
+ struct fuse_req *req,
+ unsigned issue_flags)
+{
+ struct fuse_args_pages *ap;
+ size_t total_bytes = 0;
+ struct bio_vec *bvs;
+ int err, ddir, i;
+
+ /* out_pages indicates a read, in_pages indicates a write */
+ ddir = req->args->out_pages ? ITER_DEST : ITER_SOURCE;
+
+ ap = container_of(req->args, typeof(*ap), args);
+
+ /*
+ * We can avoid having to allocate the bvs array when folios and
+ * descriptors are internally represented by bvs in fuse
+ */
+ bvs = kcalloc(ap->num_folios, sizeof(*bvs), GFP_KERNEL_ACCOUNT);
+ if (!bvs)
+ return -ENOMEM;
+
+ for (i = 0; i < ap->num_folios; i++) {
+ total_bytes += ap->descs[i].length;
+ bvs[i].bv_page = folio_page(ap->folios[i], 0);
+ bvs[i].bv_offset = ap->descs[i].offset;
+ bvs[i].bv_len = ap->descs[i].length;
+ }
+
+ err = io_buffer_register_bvec(ent->cmd, bvs, ap->num_folios,
+ total_bytes, ddir, ent->fixed_buf_id,
+ issue_flags);
+ kfree(bvs);
+ if (err)
+ return err;
+
+ ent->zero_copied = true;
+
+ return 0;
+}
+
/*
* Copy data from the req to the ring buffer
*/
static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
- struct fuse_ring_ent *ent)
+ struct fuse_ring_ent *ent,
+ unsigned int issue_flags)
{
struct fuse_copy_state cs;
struct fuse_args *args = req->args;
@@ -837,6 +916,11 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
num_args--;
}
+ if (can_zero_copy_req(ent, req)) {
+ err = fuse_uring_set_up_zero_copy(ent, req, issue_flags);
+ if (err)
+ return err;
+ }
/* copy the payload */
err = fuse_copy_args(&cs, num_args, args->in_pages,
(struct fuse_arg *)in_args, 0);
@@ -847,12 +931,17 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
}
ent_in_out.payload_sz = cs.ring.copied_sz;
+ if (cs.skip_folio_copy && args->in_pages)
+ ent_in_out.payload_sz +=
+ args->in_args[args->in_numargs - 1].size;
+
return copy_header_to_ring(ent, FUSE_URING_HEADER_RING_ENT,
&ent_in_out, sizeof(ent_in_out));
}
static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
- struct fuse_req *req)
+ struct fuse_req *req,
+ unsigned int issue_flags)
{
struct fuse_ring_queue *queue = ent->queue;
struct fuse_ring *ring = queue->ring;
@@ -870,7 +959,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
return err;
/* copy the request */
- err = fuse_uring_args_to_ring(ring, req, ent);
+ err = fuse_uring_args_to_ring(ring, req, ent, issue_flags);
if (unlikely(err)) {
pr_info_ratelimited("Copy to ring failed: %d\n", err);
return err;
@@ -881,11 +970,20 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
sizeof(req->in.h));
}
-static bool fuse_uring_req_has_payload(struct fuse_req *req)
+static bool fuse_uring_req_has_copyable_payload(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
{
struct fuse_args *args = req->args;
- return args->in_numargs > 1 || args->out_numargs;
+ if (!can_zero_copy_req(ent, req))
+ return args->in_numargs > 1 || args->out_numargs;
+
+ if ((args->in_numargs > 1) && (!args->in_pages || args->in_numargs > 2))
+ return true;
+ if (args->out_numargs && (!args->out_pages || args->out_numargs > 1))
+ return true;
+
+ return false;
}
static int fuse_uring_select_buffer(struct fuse_ring_ent *ent,
@@ -946,7 +1044,7 @@ static int fuse_uring_next_req_update_buffer(struct fuse_ring_ent *ent,
ent->headers_iter.data_source = false;
buffer_selected = ent->payload_kvec.iov_base != 0;
- has_payload = fuse_uring_req_has_payload(req);
+ has_payload = fuse_uring_req_has_copyable_payload(ent, req);
if (has_payload && !buffer_selected)
return fuse_uring_select_buffer(ent, issue_flags);
@@ -972,22 +1070,23 @@ static int fuse_uring_prep_buffer(struct fuse_ring_ent *ent,
ent->headers_iter.data_source = false;
/* no payload to copy, can skip selecting a buffer */
- if (!fuse_uring_req_has_payload(req))
+ if (!fuse_uring_req_has_copyable_payload(ent, req))
return 0;
return fuse_uring_select_buffer(ent, issue_flags);
}
static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
- struct fuse_req *req)
+ struct fuse_req *req,
+ unsigned int issue_flags)
{
int err;
- err = fuse_uring_copy_to_ring(ent, req);
+ err = fuse_uring_copy_to_ring(ent, req, issue_flags);
if (!err)
set_bit(FR_SENT, &req->flags);
else
- fuse_uring_req_end(ent, req, err);
+ fuse_uring_req_end(ent, req, err, issue_flags);
return err;
}
@@ -1092,7 +1191,7 @@ static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
err = fuse_uring_copy_from_ring(ring, req, ent);
out:
- fuse_uring_req_end(ent, req, err);
+ fuse_uring_req_end(ent, req, err, issue_flags);
}
/*
@@ -1115,7 +1214,7 @@ static bool fuse_uring_get_next_fuse_req(struct fuse_ring_ent *ent,
spin_unlock(&queue->lock);
if (req) {
- err = fuse_uring_prepare_send(ent, req);
+ err = fuse_uring_prepare_send(ent, req, issue_flags);
if (err)
goto retry;
}
@@ -1155,11 +1254,15 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
static void fuse_uring_headers_cleanup(struct fuse_ring_ent *ent,
unsigned int issue_flags)
{
+ u16 headers_index = FUSE_URING_FIXED_HEADERS_OFFSET;
+
if (!ent->queue->use_bufring)
return;
- WARN_ON_ONCE(io_uring_cmd_fixed_index_put(ent->cmd,
- FUSE_URING_FIXED_HEADERS_OFFSET,
+ if (ent->queue->use_zero_copy)
+ headers_index += ent->queue->zero_copy_depth;
+
+ WARN_ON_ONCE(io_uring_cmd_fixed_index_put(ent->cmd, headers_index,
issue_flags));
}
@@ -1167,6 +1270,7 @@ static int fuse_uring_headers_prep(struct fuse_ring_ent *ent, unsigned int dir,
unsigned int issue_flags)
{
size_t header_size = sizeof(struct fuse_uring_req_header);
+ u16 headers_index = FUSE_URING_FIXED_HEADERS_OFFSET;
struct io_uring_cmd *cmd = ent->cmd;
unsigned int offset;
int err;
@@ -1176,11 +1280,15 @@ static int fuse_uring_headers_prep(struct fuse_ring_ent *ent, unsigned int dir,
offset = ent->fixed_buf_id * header_size;
- err = io_uring_cmd_fixed_index_get(cmd, FUSE_URING_FIXED_HEADERS_OFFSET,
- offset, header_size, dir,
+ if (ent->queue->use_zero_copy)
+ headers_index += ent->queue->zero_copy_depth;
+
+ err = io_uring_cmd_fixed_index_get(cmd, headers_index, offset,
+ header_size, dir,
&ent->headers_iter, issue_flags);
WARN_ON_ONCE(err);
+
return err;
}
@@ -1251,7 +1359,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
err = fuse_uring_headers_prep(ent, ITER_SOURCE, issue_flags);
if (err)
- fuse_uring_req_end(ent, req, err);
+ fuse_uring_req_end(ent, req, err, issue_flags);
else
fuse_uring_commit(ent, req, issue_flags);
@@ -1412,6 +1520,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
{
const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
bool use_bufring = READ_ONCE(cmd_req->init.use_bufring);
+ bool zero_copy = READ_ONCE(cmd_req->init.zero_copy);
struct fuse_ring *ring = smp_load_acquire(&fc->ring);
struct fuse_ring_queue *queue;
struct fuse_ring_ent *ent;
@@ -1433,11 +1542,12 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
queue = ring->queues[qid];
if (!queue) {
queue = fuse_uring_create_queue(cmd, ring, qid, use_bufring,
- issue_flags);
+ zero_copy, issue_flags);
if (IS_ERR(queue))
return PTR_ERR(queue);
} else {
- if (queue->use_bufring != use_bufring)
+ if ((queue->use_bufring != use_bufring) ||
+ (queue->use_zero_copy != zero_copy))
return -EINVAL;
}
@@ -1544,7 +1654,7 @@ static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw)
if (fuse_uring_headers_prep(ent, ITER_DEST, issue_flags))
return;
- if (fuse_uring_prepare_send(ent, ent->fuse_req))
+ if (fuse_uring_prepare_send(ent, ent->fuse_req, issue_flags))
send = fuse_uring_get_next_fuse_req(ent, queue, issue_flags);
fuse_uring_headers_cleanup(ent, issue_flags);
if (!send)
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index eff14557066d..b24f89adabc1 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -57,6 +57,9 @@ struct fuse_ring_ent {
*/
unsigned int ringbuf_buf_id;
unsigned int fixed_buf_id;
+
+ /* True if the request's pages are being zero-copied */
+ bool zero_copied;
};
};
@@ -123,6 +126,14 @@ struct fuse_ring_queue {
/* synchronized by the queue lock */
struct io_buffer_list *bufring;
+
+ /*
+ * True if zero copy should be used for payloads. This is only enabled
+ * on privileged servers. Kernel-managed ring buffers must be enabled
+ * in order to use zero copy.
+ */
+ bool use_zero_copy : 1;
+ unsigned int zero_copy_depth;
};
/**
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index aa1d25421054..67b5bed451fe 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -39,6 +39,7 @@ struct fuse_copy_state {
bool is_uring:1;
/* if set, use kaddr; otherwise use pg */
bool is_kaddr:1;
+ bool skip_folio_copy:1;
struct {
unsigned int copied_sz; /* copied size into the user buffer */
} ring;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 3041177e3dd8..f5a67d27f145 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -242,7 +242,7 @@
* - add FUSE_NOTIFY_PRUNE
*
* 7.46
- * - add fuse_uring_cmd_req use_bufring
+ * - add fuse_uring_cmd_req use_bufring, zero_copy, and queue_depth
*/
#ifndef _LINUX_FUSE_H
@@ -1312,10 +1312,12 @@ struct fuse_uring_cmd_req {
union {
struct {
bool use_bufring;
+ bool zero_copy;
+ uint16_t queue_depth;
} init;
};
- uint8_t padding[5];
+ uint8_t padding[2];
};
#endif /* _LINUX_FUSE_H */
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH v2 25/25] docs: fuse: add io-uring bufring and zero-copy documentation
2025-12-18 8:32 [PATCH v2 00/25] fuse/io-uring: add kernel-managed buffer rings and zero-copy Joanne Koong
` (23 preceding siblings ...)
2025-12-18 8:33 ` [PATCH v2 24/25] fuse: add zero-copy over io-uring Joanne Koong
@ 2025-12-18 8:33 ` Joanne Koong
2025-12-21 2:28 ` kernel test robot
24 siblings, 1 reply; 33+ messages in thread
From: Joanne Koong @ 2025-12-18 8:33 UTC (permalink / raw)
To: miklos, axboe
Cc: bschubert, asml.silence, io-uring, csander, xiaobing.li,
linux-fsdevel
Add documentation for fuse over io-uring usage of kernel-managed
bufrings and zero-copy.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
.../filesystems/fuse/fuse-io-uring.rst | 55 ++++++++++++++++++-
1 file changed, 54 insertions(+), 1 deletion(-)
diff --git a/Documentation/filesystems/fuse/fuse-io-uring.rst b/Documentation/filesystems/fuse/fuse-io-uring.rst
index d73dd0dbd238..4c17169069e9 100644
--- a/Documentation/filesystems/fuse/fuse-io-uring.rst
+++ b/Documentation/filesystems/fuse/fuse-io-uring.rst
@@ -95,5 +95,58 @@ Sending requests with CQEs
| <fuse_unlink() |
| <sys_unlink() |
+Kernel-managed buffer rings
+===========================
-
+Kernel-managed buffer rings have two main advantages:
+* eliminates the overhead of pinning/unpinning user pages and translating
+ virtual addresses for every server-kernel interaction
+* reduces buffer memory allocation requirements
+
+In order to use buffer rings, the server must preregister the following:
+* a fixed buffer at index 0. This is where the headers will reside
+* a kernel-managed buffer ring. This is where the payload will reside
+
+At a high-level, this is how fuse uses buffer rings:
+* The server registers a kernel-managed buffer ring. In the kernel this
+ allocates the pages needed for the buffers and vmaps them. The server
+ obtains the virtual address for the buffers through an mmap call on the ring
+ fd.
+* When there is a request from a client, fuse will select a buffer from the
+ ring if there is any payload that needs to be copied, copy over the payload
+ to the selected buffer, and copy over the headers to the fixed buffer at
+ index 0, at the buffer id that corresponds to the server (which the server
+ needs to specify through sqe->buf_index).
+* The server obtains a cqe representing the request. The cqe flag will have
+ IORING_CQE_F_BUFFER set if a selected buffer was used for the payload. The
+ buffer id is stashed in cqe->flags (through IORING_CQE_BUFFER_SHIFT). The
+ server can directly access the payload by using that buffer id to calculate
+ the offset into the virtual address obtained for the buffers.
+* The server processes the request and then sends a
+ FUSE_URING_CMD_COMMIT_AND_FETCH sqe with the reply.
+* When the kernel handles the sqe, it will process the reply and if there is a
+ next request, it will reuse the same selected buffer for the request. If
+ there is no next request, it will recycle the buffer back to the ring.
+
+Zero-copy
+=========
+
+Fuse io-uring zero-copy allows the server to directly read from / write to the
+client's pages and bypass any intermediary buffer copies. This is only allowed
+on privileged servers.
+
+In order to use zero-copy, the server must preregister the following:
+* a sparse buffer for every entry in the queue. This is where the client's
+ pages will reside
+* a fixed buffer at index queue_depth (tailing the sparse buffer).
+ This is where the headers will reside
+* a kernel-managed buffer ring. This is where any non-zero-copied payload (e.g.
+ out headers) will reside
+
+When the client issues a read/write, fuse stores the client's underlying pages
+in the sparse buffer entry corresponding to the ent in the queue. The server
+can then issue reads/writes on these pages through io_uring rw operations.
+Please note that the server is not able to directly access these pages; it
+must go through the io-uring interface to read/write to them. The pages are
+unregistered once the server replies to the request. Non-zero-copyable
+payload (if needed) is placed in a buffer from the kernel-managed buffer ring.
--
2.47.3
^ permalink raw reply related [flat|nested] 33+ messages in thread* Re: [PATCH v2 25/25] docs: fuse: add io-uring bufring and zero-copy documentation
2025-12-18 8:33 ` [PATCH v2 25/25] docs: fuse: add io-uring bufring and zero-copy documentation Joanne Koong
@ 2025-12-21 2:28 ` kernel test robot
0 siblings, 0 replies; 33+ messages in thread
From: kernel test robot @ 2025-12-21 2:28 UTC (permalink / raw)
To: Joanne Koong, miklos, axboe
Cc: oe-kbuild-all, bschubert, asml.silence, io-uring, csander,
xiaobing.li, linux-fsdevel
Hi Joanne,
kernel test robot noticed the following build warnings:
[auto build test WARNING on axboe/for-next]
[also build test WARNING on linus/master v6.19-rc1 next-20251219]
[cannot apply to mszeredi-fuse/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Joanne-Koong/io_uring-kbuf-refactor-io_buf_pbuf_register-logic-into-generic-helpers/20251218-165107
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20251218083319.3485503-26-joannelkoong%40gmail.com
patch subject: [PATCH v2 25/25] docs: fuse: add io-uring bufring and zero-copy documentation
reproduce: (https://download.01.org/0day-ci/archive/20251221/202512210331.Yc46M5Rg-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512210331.Yc46M5Rg-lkp@intel.com/
All warnings (new ones prefixed by >>):
ERROR: Cannot find file ./include/linux/fscache.h
WARNING: No kernel-doc for file ./include/linux/fscache.h
ERROR: Cannot find file ./include/linux/fiemap.h
WARNING: No kernel-doc for file ./include/linux/fiemap.h
Documentation/filesystems/fuse/fuse-io-uring.rst:103: ERROR: Unexpected indentation. [docutils]
>> Documentation/filesystems/fuse/fuse-io-uring.rst:104: WARNING: Block quote ends without a blank line; unexpected unindent. [docutils]
Documentation/filesystems/fuse/fuse-io-uring.rst:112: ERROR: Unexpected indentation. [docutils]
Documentation/filesystems/fuse/fuse-io-uring.rst:115: WARNING: Block quote ends without a blank line; unexpected unindent. [docutils]
Documentation/filesystems/fuse/fuse-io-uring.rst:140: ERROR: Unexpected indentation. [docutils]
Documentation/filesystems/fuse/fuse-io-uring.rst:141: WARNING: Block quote ends without a blank line; unexpected unindent. [docutils]
ERROR: Cannot find file ./include/linux/jbd2.h
vim +104 Documentation/filesystems/fuse/fuse-io-uring.rst
100
101 Kernel-managed buffer rings have two main advantages:
102 * eliminates the overhead of pinning/unpinning user pages and translating
> 103 virtual addresses for every server-kernel interaction
> 104 * reduces buffer memory allocation requirements
105
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 33+ messages in thread