From: Bernd Schubert <bernd@bsbernd.com>
To: Joanne Koong <joannelkoong@gmail.com>,
miklos@szeredi.hu, axboe@kernel.dk
Cc: linux-fsdevel@vger.kernel.org, bschubert@ddn.com,
asml.silence@gmail.com, io-uring@vger.kernel.org,
xiaobing.li@samsung.com, csander@purestorage.com,
kernel-team@meta.com
Subject: Re: [PATCH v2 8/8] fuse: support io-uring registered buffers
Date: Thu, 6 Nov 2025 20:48:38 +0100
Message-ID: <a335fd2c-03ca-4201-abcf-74809b84c426@bsbernd.com>
In-Reply-To: <20251027222808.2332692-9-joannelkoong@gmail.com>
On 10/27/25 23:28, Joanne Koong wrote:
> Add support for io-uring registered buffers for fuse daemons
> communicating through the io-uring interface. Daemons may register
> buffers ahead of time, which will eliminate the overhead of
> pinning/unpinning user pages and translating virtual addresses for every
> server-kernel interaction.
>
> To support page-aligned payloads, the buffer is structured such that the
> payload is at the front of the buffer and the fuse_uring_req_header is
> offset from the end of the buffer.
>
> To be backwards compatible, fuse uring still needs to support non-registered
> buffers as well.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
> fs/fuse/dev_uring.c | 200 +++++++++++++++++++++++++++++++++---------
> fs/fuse/dev_uring_i.h | 27 +++++-
> 2 files changed, 183 insertions(+), 44 deletions(-)
>
> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> index c6b22b14b354..f501bc81f331 100644
> --- a/fs/fuse/dev_uring.c
> +++ b/fs/fuse/dev_uring.c
> @@ -580,6 +580,22 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
> return err;
> }
>
> +static void *get_kernel_ring_header(struct fuse_ring_ent *ent,
> + enum fuse_uring_header_type type)
> +{
> + switch (type) {
> + case FUSE_URING_HEADER_IN_OUT:
> + return &ent->headers->in_out;
> + case FUSE_URING_HEADER_OP:
> + return &ent->headers->op_in;
> + case FUSE_URING_HEADER_RING_ENT:
> + return &ent->headers->ring_ent_in_out;
> + }
> +
> + WARN_ON_ONCE(1);
> + return NULL;
> +}
> +
> static void __user *get_user_ring_header(struct fuse_ring_ent *ent,
> enum fuse_uring_header_type type)
> {
> @@ -600,16 +616,22 @@ static int copy_header_to_ring(struct fuse_ring_ent *ent,
> enum fuse_uring_header_type type,
> const void *header, size_t header_size)
> {
> - void __user *ring = get_user_ring_header(ent, type);
> + if (ent->fixed_buffer) {
> + void *ring = get_kernel_ring_header(ent, type);
>
> - if (!ring)
> - return -EINVAL;
> + if (!ring)
> + return -EINVAL;
> + memcpy(ring, header, header_size);
> + } else {
> + void __user *ring = get_user_ring_header(ent, type);
>
> - if (copy_to_user(ring, header, header_size)) {
> - pr_info_ratelimited("Copying header to ring failed.\n");
> - return -EFAULT;
> + if (!ring)
> + return -EINVAL;
> + if (copy_to_user(ring, header, header_size)) {
> + pr_info_ratelimited("Copying header to ring failed.\n");
> + return -EFAULT;
> + }
> }
> -
> return 0;
> }
>
> @@ -617,14 +639,21 @@ static int copy_header_from_ring(struct fuse_ring_ent *ent,
> enum fuse_uring_header_type type,
> void *header, size_t header_size)
> {
> - const void __user *ring = get_user_ring_header(ent, type);
> + if (ent->fixed_buffer) {
> + const void *ring = get_kernel_ring_header(ent, type);
>
> - if (!ring)
> - return -EINVAL;
> + if (!ring)
> + return -EINVAL;
> + memcpy(header, ring, header_size);
> + } else {
> + const void __user *ring = get_user_ring_header(ent, type);
>
> - if (copy_from_user(header, ring, header_size)) {
> - pr_info_ratelimited("Copying header from ring failed.\n");
> - return -EFAULT;
> + if (!ring)
> + return -EINVAL;
> + if (copy_from_user(header, ring, header_size)) {
> + pr_info_ratelimited("Copying header from ring failed.\n");
> + return -EFAULT;
> + }
> }
>
> return 0;
> @@ -637,11 +666,15 @@ static int setup_fuse_copy_state(struct fuse_ring *ring, struct fuse_req *req,
> {
> int err;
>
> - err = import_ubuf(rw, ent->user_payload, ring->max_payload_sz,
> - iter);
> - if (err) {
> - pr_info_ratelimited("fuse: Import of user buffer failed\n");
> - return err;
> + if (ent->fixed_buffer) {
> + *iter = ent->payload_iter;
> + } else {
> + err = import_ubuf(rw, ent->user_payload, ring->max_payload_sz,
> + iter);
> + if (err) {
> + pr_info_ratelimited("fuse: Import of user buffer failed\n");
> + return err;
> + }
> }
>
> fuse_copy_init(cs, rw == ITER_DEST, iter);
> @@ -754,6 +787,62 @@ static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
> sizeof(req->in.h));
> }
>
> +/*
> + * Prepare fixed buffer for access. Sets up the payload iter and kmaps the
> + * header.
> + *
> + * Callers must call fuse_uring_unmap_buffer() in the same scope to release the
> + * header mapping.
> + *
> + * For non-fixed buffers, this is a no-op.
> + */
> +static int fuse_uring_map_buffer(struct fuse_ring_ent *ent)
> +{
> + size_t header_size = sizeof(struct fuse_uring_req_header);
> + struct iov_iter iter;
> + struct page *header_page;
> + size_t count, start;
> + ssize_t copied;
> + int err;
> +
> + if (!ent->fixed_buffer)
> + return 0;
> +
> + err = io_uring_cmd_import_fixed_full(ITER_DEST, &iter, ent->cmd, 0);
This seems to be a rather expensive call, especially as it gets
called twice (once during submit and once during fetch).
Wouldn't there be a possibility to check whether the user buffer
changed and to keep the existing iter otherwise? I think Caleb had a
similar idea in patch 1/8.
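Roughly what I have in mind (completely untested sketch; iter_valid and
cached_buf_index are made-up fields on fuse_ring_ent, and comparing the
buf_index alone wouldn't catch a buffer being re-registered at the same
slot - this is just to illustrate the direction):

	u16 buf_index = READ_ONCE(ent->cmd->sqe->buf_index);

	if (!ent->iter_valid || buf_index != ent->cached_buf_index) {
		err = io_uring_cmd_import_fixed_full(ITER_DEST, &iter,
						     ent->cmd, 0);
		if (err)
			return err;
		/* ... existing truncate + header kmap setup ... */
		ent->cached_buf_index = buf_index;
		ent->iter_valid = true;
	}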
> + if (err)
> + return err;
> +
> + count = iov_iter_count(&iter);
> + if (count < header_size || count & (PAGE_SIZE - 1))
> + return -EINVAL;
|| !PAGE_ALIGNED(count)) ?
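I.e. the same check, just spelling out the intent:

	if (count < header_size || !PAGE_ALIGNED(count))
		return -EINVAL;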
> +
> + /* Adjust the payload iter to protect the header from any overwrites */
> + ent->payload_iter = iter;
> + iov_iter_truncate(&ent->payload_iter, count - header_size);
> +
> + /* Set up the headers */
> + iov_iter_advance(&iter, count - header_size);
> + copied = iov_iter_get_pages2(&iter, &header_page, header_size, 1, &start);
The iter is later used for the payload, but I don't see a reset -
iov_iter_revert()?
> + if (copied < header_size)
> + return -EFAULT;
> + ent->headers = kmap_local_page(header_page) + start;
My plan for the alternative pinning patch (with io-uring) was to let the
header be shared by multiple entries. Current libfuse master handles
a fixed page-size buffer for the payload (prepared for page pinning - I
didn't expect to be blocked for 9 months by other work); what is still
missing is sharing it between ring entries.
I think that wouldn't work with the registered-buffer approach - it
always needs one full page?
I would also like to discuss dynamic, multiple payload sizes per queue,
for example something like
256 x 4K
8 x 128K
4 x 1M
I think there are currently two ways to do that:
1) Sort entries into pools.
2) Sort buffers into pools and let entries use these. Here the header
would be fixed and the payload would come from a pool.
With the approach of having payload and header in one buffer we couldn't
use 2). Using 1) should be fine, though.
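For 1), roughly (illustration only - fuse_ring_pool,
FUSE_URING_NR_POOLS and friends are invented names; pools are assumed
to be sorted by ascending payload size):

	struct fuse_ring_pool {
		size_t payload_sz;	/* e.g. 4K, 128K, 1M */
		/* available entries whose buffers have this payload size */
		struct list_head ent_avail_queue;
	};

	/* per queue: struct fuse_ring_pool pools[FUSE_URING_NR_POOLS]; */

	static struct fuse_ring_pool *
	fuse_uring_pool_for_req(struct fuse_ring_queue *queue, size_t len)
	{
		int i;

		/* pick the smallest pool whose payload size fits */
		for (i = 0; i < FUSE_URING_NR_POOLS; i++)
			if (len <= queue->pools[i].payload_sz)
				return &queue->pools[i];
		return NULL;
	}

Dispatch would then take the next free entry from that pool instead of
from the single per-queue ent_avail_queue.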
> +
> + /*
> + * We can release the acquired reference on the header page immediately
> + * since the page is pinned and io_uring_cmd_import_fixed_full()
> + * prevents it from being unpinned while we are using it.
> + */
> + put_page(header_page);
> +
> + return 0;
> +}
> +
> +static void fuse_uring_unmap_buffer(struct fuse_ring_ent *ent)
> +{
> + if (ent->fixed_buffer)
> + kunmap_local(ent->headers);
> +}
> +
> static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
> struct fuse_req *req)
> {
> @@ -932,6 +1021,7 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
> unsigned int qid = READ_ONCE(cmd_req->qid);
> struct fuse_pqueue *fpq;
> struct fuse_req *req;
> + bool next_req;
>
> err = -ENOTCONN;
> if (!ring)
> @@ -982,6 +1072,13 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
>
> /* without the queue lock, as other locks are taken */
> fuse_uring_prepare_cancel(cmd, issue_flags, ent);
> +
> + err = fuse_uring_map_buffer(ent);
> + if (err) {
> + fuse_uring_req_end(ent, req, err);
> + return err;
> + }
> +
> fuse_uring_commit(ent, req, issue_flags);
>
> /*
> @@ -990,7 +1087,9 @@ static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
> * and fetching is done in one step vs legacy fuse, which has separated
> * read (fetch request) and write (commit result).
> */
> - if (fuse_uring_get_next_fuse_req(ent, queue))
> + next_req = fuse_uring_get_next_fuse_req(ent, queue);
> + fuse_uring_unmap_buffer(ent);
> + if (next_req)
> fuse_uring_send(ent, cmd, 0, issue_flags);
> return 0;
> }
> @@ -1086,39 +1185,49 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
> struct iovec iov[FUSE_URING_IOV_SEGS];
> int err;
>
> + err = -ENOMEM;
> + ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
> + if (!ent)
> + return ERR_PTR(err);
> +
> + INIT_LIST_HEAD(&ent->list);
> +
> + ent->queue = queue;
> +
> + if (READ_ONCE(cmd->sqe->uring_cmd_flags) & IORING_URING_CMD_FIXED) {
> + ent->fixed_buffer = true;
> + atomic_inc(&ring->queue_refs);
> + return ent;
> + }
> +
> err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov);
> if (err) {
> pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n",
> err);
> - return ERR_PTR(err);
> + goto error;
> }
>
> err = -EINVAL;
> if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) {
> pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len);
> - return ERR_PTR(err);
> + goto error;
> }
>
> payload_size = iov[1].iov_len;
> if (payload_size < ring->max_payload_sz) {
> pr_info_ratelimited("Invalid req payload len %zu\n",
> payload_size);
> - return ERR_PTR(err);
> + goto error;
> }
> -
> - err = -ENOMEM;
> - ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
> - if (!ent)
> - return ERR_PTR(err);
> -
> - INIT_LIST_HEAD(&ent->list);
> -
> - ent->queue = queue;
> ent->user_headers = iov[0].iov_base;
> ent->user_payload = iov[1].iov_base;
>
> atomic_inc(&ring->queue_refs);
> return ent;
> +
> +error:
> + kfree(ent);
> + return ERR_PTR(err);
> }
>
> /*
> @@ -1249,20 +1358,29 @@ static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
> {
> struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
> struct fuse_ring_queue *queue = ent->queue;
> + bool send_ent = true;
> int err;
>
> - if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
> - err = fuse_uring_prepare_send(ent, ent->fuse_req);
> - if (err) {
> - if (!fuse_uring_get_next_fuse_req(ent, queue))
> - return;
> - err = 0;
> - }
> - } else {
> - err = -ECANCELED;
> + if (issue_flags & IO_URING_F_TASK_DEAD) {
> + fuse_uring_send(ent, cmd, -ECANCELED, issue_flags);
> + return;
> + }
> +
> + err = fuse_uring_map_buffer(ent);
> + if (err) {
> + fuse_uring_req_end(ent, ent->fuse_req, err);
> + return;
I think this needs to abort the connection now. There could be multiple
commands on the queue that would now be stuck, and there is no
notification to the fuse server either.
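Something like (untested; assuming fuse_abort_conn() is safe to call
from this context):

	err = fuse_uring_map_buffer(ent);
	if (err) {
		struct fuse_conn *fc = queue->ring->fc;

		fuse_uring_req_end(ent, ent->fuse_req, err);
		fuse_abort_conn(fc);
		return;
	}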
> + }
> +
> + err = fuse_uring_prepare_send(ent, ent->fuse_req);
> + if (err) {
> + send_ent = fuse_uring_get_next_fuse_req(ent, queue);
> + err = 0;
> }
> + fuse_uring_unmap_buffer(ent);
>
> - fuse_uring_send(ent, cmd, err, issue_flags);
> + if (send_ent)
> + fuse_uring_send(ent, cmd, err, issue_flags);
> }
>
> static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
> diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
> index 381fd0b8156a..fe14acccd6a6 100644
> --- a/fs/fuse/dev_uring_i.h
> +++ b/fs/fuse/dev_uring_i.h
> @@ -7,6 +7,7 @@
> #ifndef _FS_FUSE_DEV_URING_I_H
> #define _FS_FUSE_DEV_URING_I_H
>
> +#include <linux/uio.h>
> #include "fuse_i.h"
>
> #ifdef CONFIG_FUSE_IO_URING
> @@ -38,9 +39,29 @@ enum fuse_ring_req_state {
>
> /** A fuse ring entry, part of the ring queue */
> struct fuse_ring_ent {
> - /* userspace buffer */
> - struct fuse_uring_req_header __user *user_headers;
> - void __user *user_payload;
> + /*
> + * If true, the buffer was pre-registered by the daemon and the
> + * pages backing it are pinned in kernel memory. The fixed buffer layout
> + * is: [payload][header at end]. Use payload_iter and headers for
> + * copying to/from the ring.
> + *
> + * Otherwise, use user_headers and user_payload which point to userspace
> + * addresses representing the ring memory.
> + */
> + bool fixed_buffer;
> +
> + union {
> + /* fixed_buffer == false */
> + struct {
> + struct fuse_uring_req_header __user *user_headers;
> + void __user *user_payload;
> + };
> + /* fixed_buffer == true */
> + struct {
> + struct fuse_uring_req_header *headers;
> + struct iov_iter payload_iter;
> + };
> + };
>
> /* the ring queue that owns the request */
> struct fuse_ring_queue *queue;
Thanks,
Bernd