From: Caleb Sander Mateos <[email protected]>
To: Keith Busch <[email protected]>
Cc: [email protected], [email protected], [email protected],
[email protected], [email protected],
[email protected], Keith Busch <[email protected]>
Subject: Re: [PATCHv5 07/11] io_uring: add support for kernel registered bvecs
Date: Tue, 25 Feb 2025 12:58:43 -0800 [thread overview]
Message-ID: <CADUfDZreF+YLLkE4Z+UniVXBo7HRm9nTd+O9yRVqJ9STpFJaJA@mail.gmail.com> (raw)
In-Reply-To: <[email protected]>
On Mon, Feb 24, 2025 at 1:31 PM Keith Busch <[email protected]> wrote:
>
> From: Keith Busch <[email protected]>
>
> Provide an interface for the kernel to leverage the existing
> pre-registered buffers that io_uring provides. User space can reference
> these later to achieve zero-copy IO.
>
> User space must register an empty fixed buffer table with io_uring in
> order for the kernel to make use of it.
>
> Signed-off-by: Keith Busch <[email protected]>
> ---
> include/linux/io_uring/cmd.h | 7 ++
> io_uring/rsrc.c | 123 +++++++++++++++++++++++++++++++++--
> io_uring/rsrc.h | 8 +++
> 3 files changed, 131 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
> index 87150dc0a07cf..cf8d80d847344 100644
> --- a/include/linux/io_uring/cmd.h
> +++ b/include/linux/io_uring/cmd.h
> @@ -4,6 +4,7 @@
>
> #include <uapi/linux/io_uring.h>
> #include <linux/io_uring_types.h>
> +#include <linux/blk-mq.h>
>
> /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
> #define IORING_URING_CMD_CANCELABLE (1U << 30)
> @@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur
> return cmd_to_io_kiocb(cmd)->async_data;
> }
>
> +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
> + void (*release)(void *), unsigned int index,
> + unsigned int issue_flags);
> +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
> + unsigned int issue_flags);
> +
> #endif /* _LINUX_IO_URING_CMD_H */
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index f814526982c36..e0c6ed3aef5b5 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -9,6 +9,7 @@
> #include <linux/hugetlb.h>
> #include <linux/compat.h>
> #include <linux/io_uring.h>
> +#include <linux/io_uring/cmd.h>
>
> #include <uapi/linux/io_uring.h>
>
> @@ -104,14 +105,21 @@ int io_buffer_validate(struct iovec *iov)
> static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> {
> struct io_mapped_ubuf *imu = node->buf;
> - unsigned int i;
>
> if (!refcount_dec_and_test(&imu->refs))
> return;
> - for (i = 0; i < imu->nr_bvecs; i++)
> - unpin_user_page(imu->bvec[i].bv_page);
> - if (imu->acct_pages)
> - io_unaccount_mem(ctx, imu->acct_pages);
> +
> + if (imu->release) {
> + imu->release(imu->priv);
> + } else {
> + unsigned int i;
> +
> + for (i = 0; i < imu->nr_bvecs; i++)
> + unpin_user_page(imu->bvec[i].bv_page);
> + if (imu->acct_pages)
> + io_unaccount_mem(ctx, imu->acct_pages);
> + }
> +
> kvfree(imu);
> }
>
> @@ -761,6 +769,9 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> imu->len = iov->iov_len;
> imu->nr_bvecs = nr_pages;
> imu->folio_shift = PAGE_SHIFT;
> + imu->release = NULL;
> + imu->priv = NULL;
> + imu->perm = IO_IMU_READABLE | IO_IMU_WRITEABLE;
> if (coalesced)
> imu->folio_shift = data.folio_shift;
> refcount_set(&imu->refs, 1);
> @@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> return ret;
> }
>
> +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
> + void (*release)(void *), unsigned int index,
> + unsigned int issue_flags)
> +{
> + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
> + struct io_rsrc_data *data = &ctx->buf_table;
> + struct req_iterator rq_iter;
> + struct io_mapped_ubuf *imu;
> + struct io_rsrc_node *node;
> + struct bio_vec bv, *bvec;
> + u16 nr_bvecs;
> + int ret = 0;
> +
> +
> + io_ring_submit_lock(ctx, issue_flags);
> + if (index >= data->nr) {
> + ret = -EINVAL;
> + goto unlock;
> + }
> + index = array_index_nospec(index, data->nr);
> +
> + if (data->nodes[index] ) {
Nit: there is an extra space before the closing ')'.
> + ret = -EBUSY;
> + goto unlock;
> + }
> +
> + node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> + if (!node) {
> + ret = -ENOMEM;
> + goto unlock;
> + }
> +
> + nr_bvecs = blk_rq_nr_phys_segments(rq);
> + imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> + if (!imu) {
> + kfree(node);
> + ret = -ENOMEM;
> + goto unlock;
> + }
> +
> + imu->ubuf = 0;
> + imu->len = blk_rq_bytes(rq);
> + imu->acct_pages = 0;
> + imu->folio_shift = PAGE_SHIFT;
> + imu->nr_bvecs = nr_bvecs;
> + refcount_set(&imu->refs, 1);
> + imu->release = release;
> + imu->priv = rq;
> +
> + if (op_is_write(req_op(rq)))
> + imu->perm = IO_IMU_WRITEABLE;
> + else
> + imu->perm = IO_IMU_READABLE;
Could this be simplified to: imu->perm = 1 << rq_data_dir(rq); ?
> +
> + bvec = imu->bvec;
> + rq_for_each_bvec(bv, rq, rq_iter)
> + *bvec++ = bv;
> +
> + node->buf = imu;
> + data->nodes[index] = node;
> +unlock:
> + io_ring_submit_unlock(ctx, issue_flags);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
> +
> +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
> + unsigned int issue_flags)
> +{
> + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
> + struct io_rsrc_data *data = &ctx->buf_table;
> + struct io_rsrc_node *node;
> +
> + io_ring_submit_lock(ctx, issue_flags);
> + if (index >= data->nr)
> + goto unlock;
> + index = array_index_nospec(index, data->nr);
> +
> + node = data->nodes[index];
> + if (!node || !node->buf->release)
> + goto unlock;
Would it be useful to return some error code in these cases so
userspace can tell that the unregistration parameters were invalid?
Best,
Caleb
> +
> + io_put_rsrc_node(ctx, node);
> + data->nodes[index] = NULL;
> +unlock:
> + io_ring_submit_unlock(ctx, issue_flags);
> +}
> +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
> +
> static int io_import_fixed(int ddir, struct iov_iter *iter,
> struct io_mapped_ubuf *imu,
> u64 buf_addr, size_t len)
> @@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
> /* not inside the mapped region */
> if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
> return -EFAULT;
> + if (!(imu->perm & (1 << ddir)))
> + return -EFAULT;
>
> /*
> * Might not be a start of buffer, set size appropriately
> @@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
> /*
> * Don't use iov_iter_advance() here, as it's really slow for
> * using the latter parts of a big fixed buffer - it iterates
> - * over each segment manually. We can cheat a bit here, because
> - * we know that:
> + * over each segment manually. We can cheat a bit here for user
> + * registered nodes, because we know that:
> *
> * 1) it's a BVEC iter, we set it up
> * 2) all bvecs are the same in size, except potentially the
> @@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
> */
> const struct bio_vec *bvec = imu->bvec;
>
> + /*
> + * Kernel buffer bvecs, on the other hand, don't necessarily
> + * have the size property of user registered ones, so we have
> + * to use the slow iter advance.
> + */
> if (offset < bvec->bv_len) {
> iter->iov_offset = offset;
> + } else if (imu->release) {
> + iov_iter_advance(iter, offset);
> } else {
> unsigned long seg_skip;
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index f0e9080599646..64bf35667cf9c 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -20,6 +20,11 @@ struct io_rsrc_node {
> };
> };
>
> +enum {
> + IO_IMU_READABLE = 1 << 0,
> + IO_IMU_WRITEABLE = 1 << 1,
> +};
> +
> struct io_mapped_ubuf {
> u64 ubuf;
> unsigned int len;
> @@ -27,6 +32,9 @@ struct io_mapped_ubuf {
> unsigned int folio_shift;
> refcount_t refs;
> unsigned long acct_pages;
> + void (*release)(void *);
> + void *priv;
> + u8 perm;
> struct bio_vec bvec[] __counted_by(nr_bvecs);
> };
>
> --
> 2.43.5
>
next prev parent reply other threads:[~2025-02-25 20:58 UTC|newest]
Thread overview: 51+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-24 21:31 [PATCHv5 00/11] ublk zero copy support Keith Busch
2025-02-24 21:31 ` [PATCHv5 01/11] io_uring/rsrc: remove redundant check for valid imu Keith Busch
2025-02-25 8:37 ` Ming Lei
2025-02-25 13:13 ` Pavel Begunkov
2025-02-24 21:31 ` [PATCHv5 02/11] io_uring/nop: reuse req->buf_index Keith Busch
2025-02-24 23:30 ` Jens Axboe
2025-02-25 0:02 ` Keith Busch
2025-02-25 8:43 ` Ming Lei
2025-02-25 13:13 ` Pavel Begunkov
2025-02-24 21:31 ` [PATCHv5 03/11] io_uring/net: reuse req->buf_index for sendzc Keith Busch
2025-02-25 8:44 ` Ming Lei
2025-02-25 13:14 ` Pavel Begunkov
2025-02-24 21:31 ` [PATCHv5 04/11] io_uring/nvme: pass issue_flags to io_uring_cmd_import_fixed() Keith Busch
2025-02-25 8:52 ` Ming Lei
2025-02-24 21:31 ` [PATCHv5 05/11] io_uring: combine buffer lookup and import Keith Busch
2025-02-25 8:55 ` Ming Lei
2025-02-24 21:31 ` [PATCHv5 06/11] io_uring/rw: move fixed buffer import to issue path Keith Busch
2025-02-25 9:26 ` Ming Lei
2025-02-25 13:57 ` Pavel Begunkov
2025-02-25 20:57 ` Caleb Sander Mateos
2025-02-25 21:16 ` Keith Busch
2025-02-24 21:31 ` [PATCHv5 07/11] io_uring: add support for kernel registered bvecs Keith Busch
2025-02-25 9:40 ` Ming Lei
2025-02-25 17:32 ` Keith Busch
2025-02-25 22:47 ` Ming Lei
2025-02-25 22:55 ` Keith Busch
2025-02-25 14:00 ` Pavel Begunkov
2025-02-25 14:05 ` Pavel Begunkov
2025-02-25 20:58 ` Caleb Sander Mateos [this message]
2025-02-24 21:31 ` [PATCHv5 08/11] nvme: map uring_cmd data even if address is 0 Keith Busch
2025-02-25 9:41 ` Ming Lei
2025-02-24 21:31 ` [PATCHv5 09/11] ublk: zc register/unregister bvec Keith Busch
2025-02-25 11:00 ` Ming Lei
2025-02-25 16:35 ` Keith Busch
2025-02-25 22:56 ` Ming Lei
2025-02-25 16:19 ` Pavel Begunkov
2025-02-25 16:27 ` Keith Busch
2025-02-25 16:42 ` Pavel Begunkov
2025-02-25 16:52 ` Keith Busch
2025-02-27 4:16 ` Ming Lei
2025-02-25 21:14 ` Caleb Sander Mateos
2025-02-26 8:15 ` Ming Lei
2025-02-26 17:10 ` Keith Busch
2025-02-27 4:19 ` Ming Lei
2025-02-24 21:31 ` [PATCHv5 10/11] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-25 16:04 ` Pavel Begunkov
2025-02-24 21:31 ` [PATCHv5 11/11] io_uring: cache nodes and mapped buffers Keith Busch
2025-02-25 13:11 ` Pavel Begunkov
2025-02-25 14:10 ` [PATCHv5 00/11] ublk zero copy support Pavel Begunkov
2025-02-25 14:47 ` Jens Axboe
2025-02-25 15:07 ` (subset) " Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CADUfDZreF+YLLkE4Z+UniVXBo7HRm9nTd+O9yRVqJ9STpFJaJA@mail.gmail.com \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox