From: Caleb Sander Mateos <[email protected]>
To: Keith Busch <[email protected]>
Cc: [email protected], [email protected], [email protected],
[email protected], [email protected],
[email protected], Keith Busch <[email protected]>
Subject: Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
Date: Sun, 16 Feb 2025 14:43:40 -0800 [thread overview]
Message-ID: <CADUfDZrfmpy3hxD5z0ADxCUkWPbU0aZDMiqpyPE+AZbeDSgZ=g@mail.gmail.com> (raw)
In-Reply-To: <CADUfDZpM-TXBYQy0B4xRnKjT=-OfX+AYo+6HGA7e7pyT39LxEA@mail.gmail.com>
On Fri, Feb 14, 2025 at 6:22 PM Caleb Sander Mateos
<[email protected]> wrote:
>
> On Fri, Feb 14, 2025 at 7:46 AM Keith Busch <[email protected]> wrote:
> >
> > From: Keith Busch <[email protected]>
> >
> > Frequent alloc/free cycles on these is pretty costly. Use an io cache to
> > more efficiently reuse these buffers.
> >
> > Signed-off-by: Keith Busch <[email protected]>
> > ---
> > include/linux/io_uring_types.h | 18 +++---
> > io_uring/filetable.c | 2 +-
> > io_uring/rsrc.c | 114 +++++++++++++++++++++++++--------
> > io_uring/rsrc.h | 2 +-
> > 4 files changed, 99 insertions(+), 37 deletions(-)
> >
> > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> > index d8d717cce427f..ebaaa1c7e210f 100644
> > --- a/include/linux/io_uring_types.h
> > +++ b/include/linux/io_uring_types.h
> > @@ -67,8 +67,18 @@ struct io_file_table {
> > unsigned int alloc_hint;
> > };
> >
> > +struct io_alloc_cache {
> > + void **entries;
> > + unsigned int nr_cached;
> > + unsigned int max_cached;
> > + size_t elem_size;
> > + unsigned int init_clear;
> > +};
> > +
> > struct io_buf_table {
> > struct io_rsrc_data data;
> > + struct io_alloc_cache node_cache;
> > + struct io_alloc_cache imu_cache;
> > };
> >
> > struct io_hash_bucket {
> > @@ -222,14 +232,6 @@ struct io_submit_state {
> > struct blk_plug plug;
> > };
> >
> > -struct io_alloc_cache {
> > - void **entries;
> > - unsigned int nr_cached;
> > - unsigned int max_cached;
> > - unsigned int elem_size;
> > - unsigned int init_clear;
> > -};
> > -
> > struct io_ring_ctx {
> > /* const or read-mostly hot data */
> > struct {
> > diff --git a/io_uring/filetable.c b/io_uring/filetable.c
> > index dd8eeec97acf6..a21660e3145ab 100644
> > --- a/io_uring/filetable.c
> > +++ b/io_uring/filetable.c
> > @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
> > if (slot_index >= ctx->file_table.data.nr)
> > return -EINVAL;
> >
> > - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> > if (!node)
> > return -ENOMEM;
> >
> > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> > index fd7a1b04db8b7..26ff9b5851d94 100644
> > --- a/io_uring/rsrc.c
> > +++ b/io_uring/rsrc.c
> > @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> > #define IORING_MAX_FIXED_FILES (1U << 20)
> > #define IORING_MAX_REG_BUFFERS (1U << 14)
> >
> > +#define IO_CACHED_BVECS_SEGS 32
> > +
> > int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
> > {
> > unsigned long page_limit, cur_pages, new_pages;
> > @@ -122,19 +124,33 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> > kvfree(imu);
> > }
> >
> > -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> > +
> > +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
>
> nit: extra blank line added here
>
> > {
> > struct io_rsrc_node *node;
> >
> > - node = kzalloc(sizeof(*node), GFP_KERNEL);
> > + if (type == IORING_RSRC_FILE)
> > + node = kmalloc(sizeof(*node), GFP_KERNEL);
> > + else
> > + node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
> > if (node) {
> > node->type = type;
> > node->refs = 1;
> > + node->tag = 0;
> > + node->file_ptr = 0;
> > }
> > return node;
> > }
> >
> > -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
> > +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
> > +{
> > + kvfree(data->nodes);
> > + data->nodes = NULL;
> > + data->nr = 0;
> > +}
> > +
> > +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
> > + struct io_rsrc_data *data)
> > {
> > if (!data->nr)
> > return;
> > @@ -142,9 +158,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
> > if (data->nodes[data->nr])
> > io_put_rsrc_node(ctx, data->nodes[data->nr]);
> > }
> > - kvfree(data->nodes);
> > - data->nodes = NULL;
> > - data->nr = 0;
> > + __io_rsrc_data_free(data);
> > }
> >
> > __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> > @@ -158,6 +172,33 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> > return -ENOMEM;
> > }
> >
> > +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
> > +{
> > + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
> > + IO_CACHED_BVECS_SEGS);
> > + int ret;
> > +
> > + ret = io_rsrc_data_alloc(&table->data, nr);
> > + if (ret)
> > + return ret;
> > +
> > + ret = io_alloc_cache_init(&table->node_cache, nr,
> > + sizeof(struct io_rsrc_node), 0);
> > + if (ret)
> > + goto out_1;
> > +
> > + ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0);
> > + if (ret)
> > + goto out_2;
>
> io_alloc_cache_init() returns bool. Probably these cases should return
> -ENOMEM instead of 1?
>
> > +
> > + return 0;
> > +out_2:
> > + io_alloc_cache_free(&table->node_cache, kfree);
> > +out_1:
> > + __io_rsrc_data_free(&table->data);
> > + return ret;
> > +}
> > +
> > static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> > struct io_uring_rsrc_update2 *up,
> > unsigned nr_args)
> > @@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> > err = -EBADF;
> > break;
> > }
> > - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> > if (!node) {
> > err = -ENOMEM;
> > fput(file);
> > @@ -269,7 +310,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
> > }
> > node->tag = tag;
> > }
> > - i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
> > + i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr);
>
> Looks like this change belongs in the prior patch "io_uring: add
> abstraction for buf_table rsrc data"?
>
> > io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
> > ctx->buf_table.data.nodes[i] = node;
> > if (ctx->compat)
> > @@ -459,6 +500,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> > case IORING_RSRC_BUFFER:
> > if (node->buf)
> > io_buffer_unmap(ctx, node);
> > + if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
> > + return;
> > break;
> > default:
> > WARN_ON_ONCE(1);
> > @@ -527,7 +570,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> > goto fail;
> > }
> > ret = -ENOMEM;
> > - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> > if (!node) {
> > fput(file);
> > goto fail;
> > @@ -547,11 +590,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> > return ret;
> > }
> >
> > +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
> > + struct io_buf_table *table)
> > +{
> > + io_rsrc_data_free(ctx, &table->data);
> > + io_alloc_cache_free(&table->node_cache, kfree);
> > + io_alloc_cache_free(&table->imu_cache, kfree);
> > +}
> > +
> > int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> > {
> > if (!ctx->buf_table.data.nr)
> > return -ENXIO;
> > - io_rsrc_data_free(ctx, &ctx->buf_table.data);
> > + io_rsrc_buffer_free(ctx, &ctx->buf_table);
> > return 0;
> > }
> >
> > @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> > return true;
> > }
> >
> > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > + int nr_bvecs)
> > +{
> > + if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
>
> If there is no entry available in the cache, this will heap-allocate
> one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
> using io_alloc_cache_get() instead of io_cache_alloc(), so the
> heap-allocated fallback uses the minimal size.
>
> Also, where are these allocations returned to the imu_cache? Looks
> like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> needs to try io_alloc_cache_put() first.
Another issue I see is that io_alloc_cache elements are allocated with
kmalloc(), so they can't be freed with kvfree(). When the imu is
freed, we could check nr_bvecs <= IO_CACHED_BVECS_SEGS to tell whether
to call io_alloc_cache_put() (with a fallback to kfree()) or kvfree().
>
> Best,
> Caleb
>
> > + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
> > + GFP_KERNEL);
> > +}
> > +
> > static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> > struct iovec *iov,
> > struct page **last_hpage)
> > @@ -732,7 +792,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> > if (!iov->iov_base)
> > return NULL;
> >
> > - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> > if (!node)
> > return ERR_PTR(-ENOMEM);
> > node->buf = NULL;
> > @@ -752,7 +812,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> > coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
> > }
> >
> > - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
> > + imu = io_alloc_imu(ctx, nr_pages);
> > if (!imu)
> > goto done;
> >
> > @@ -800,9 +860,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> > unsigned int nr_args, u64 __user *tags)
> > {
> > struct page *last_hpage = NULL;
> > - struct io_rsrc_data data;
> > struct iovec fast_iov, *iov = &fast_iov;
> > const struct iovec __user *uvec;
> > + struct io_buf_table table;
> > int i, ret;
> >
> > BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
> > @@ -811,13 +871,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> > return -EBUSY;
> > if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
> > return -EINVAL;
> > - ret = io_rsrc_data_alloc(&data, nr_args);
> > + ret = io_rsrc_buffer_alloc(&table, nr_args);
> > if (ret)
> > return ret;
> >
> > if (!arg)
> > memset(iov, 0, sizeof(*iov));
> >
> > + ctx->buf_table = table;
> > for (i = 0; i < nr_args; i++) {
> > struct io_rsrc_node *node;
> > u64 tag = 0;
> > @@ -857,10 +918,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> > }
> > node->tag = tag;
> > }
> > - data.nodes[i] = node;
> > + table.data.nodes[i] = node;
> > }
> > -
> > - ctx->buf_table.data = data;
>
> Is it necessary to move this assignment? I found the existing location
> easier to reason about, since the assignment of ctx->buf_table
> represents a transfer of ownership from the local variable.
>
> > if (ret)
> > io_sqe_buffers_unregister(ctx);
> > return ret;
> > @@ -891,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
> > goto unlock;
> > }
> >
> > - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> > if (!node) {
> > ret = -ENOMEM;
> > goto unlock;
> > }
> >
> > nr_bvecs = blk_rq_nr_phys_segments(rq);
> > - imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> > +
> > + imu = io_alloc_imu(ctx, nr_bvecs);
> > if (!imu) {
> > kfree(node);
> > ret = -ENOMEM;
> > @@ -1028,7 +1088,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
> > static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
> > struct io_uring_clone_buffers *arg)
> > {
> > - struct io_rsrc_data data;
> > + struct io_buf_table table;
> > int i, ret, off, nr;
> > unsigned int nbufs;
> >
> > @@ -1059,7 +1119,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
> > return -EOVERFLOW;
> >
> > - ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
> > + ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
> > if (ret)
> > return ret;
> >
> > @@ -1068,7 +1128,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
> >
> > if (src_node) {
> > - data.nodes[i] = src_node;
> > + table.data.nodes[i] = src_node;
> > src_node->refs++;
> > }
> > }
> > @@ -1098,7 +1158,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > if (!src_node) {
> > dst_node = NULL;
> > } else {
> > - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> > + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> > if (!dst_node) {
> > ret = -ENOMEM;
> > goto out_free;
> > @@ -1107,12 +1167,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > refcount_inc(&src_node->buf->refs);
> > dst_node->buf = src_node->buf;
> > }
> > - data.nodes[off++] = dst_node;
> > + table.data.nodes[off++] = dst_node;
> > i++;
> > }
> >
> > /*
> > - * If asked for replace, put the old table. data->nodes[] holds both
> > + * If asked for replace, put the old table. table.data->nodes[] holds both
> > * old and new nodes at this point.
> > */
> > if (arg->flags & IORING_REGISTER_DST_REPLACE)
> > @@ -1125,10 +1185,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > * entry).
> > */
> > WARN_ON_ONCE(ctx->buf_table.data.nr);
> > - ctx->buf_table.data = data;
> > + ctx->buf_table = table;
> > return 0;
> > out_free:
> > - io_rsrc_data_free(ctx, &data);
> > + io_rsrc_buffer_free(ctx, &table);
> > return ret;
> > }
> >
> > diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> > index 2e8d1862caefc..c5bdac558a2b4 100644
> > --- a/io_uring/rsrc.h
> > +++ b/io_uring/rsrc.h
> > @@ -47,7 +47,7 @@ struct io_imu_folio_data {
> > unsigned int nr_folios;
> > };
> >
> > -struct io_rsrc_node *io_rsrc_node_alloc(int type);
> > +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
> > void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
> > void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
> > int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
> > --
> > 2.43.5
> >
> >
next prev parent reply other threads:[~2025-02-16 22:43 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-14 15:43 [PATCHv3 0/5] ublk zero-copy support Keith Busch
2025-02-14 15:43 ` [PATCHv3 1/5] io_uring: move fixed buffer import to issue path Keith Busch
2025-02-18 20:32 ` Caleb Sander Mateos
2025-02-14 15:43 ` [PATCHv3 2/5] io_uring: add support for kernel registered bvecs Keith Busch
2025-02-14 20:38 ` Caleb Sander Mateos
2025-02-18 19:59 ` Keith Busch
2025-02-18 20:20 ` Caleb Sander Mateos
2025-02-14 15:43 ` [PATCHv3 3/5] ublk: zc register/unregister bvec Keith Busch
2025-02-14 15:43 ` [PATCHv3 4/5] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-14 15:43 ` [PATCHv3 5/5] io_uring: cache nodes and mapped buffers Keith Busch
2025-02-15 2:22 ` Caleb Sander Mateos
2025-02-16 22:43 ` Caleb Sander Mateos [this message]
2025-02-18 20:12 ` Keith Busch
2025-02-18 20:45 ` Caleb Sander Mateos
2025-02-18 20:09 ` Keith Busch
2025-02-18 20:42 ` Caleb Sander Mateos
2025-02-18 21:12 ` Keith Busch
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CADUfDZrfmpy3hxD5z0ADxCUkWPbU0aZDMiqpyPE+AZbeDSgZ=g@mail.gmail.com' \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox