From: Caleb Sander Mateos <[email protected]>
To: Keith Busch <[email protected]>
Cc: [email protected], [email protected], [email protected],
[email protected], [email protected],
[email protected], Keith Busch <[email protected]>
Subject: Re: [PATCHv4 5/5] io_uring: cache nodes and mapped buffers
Date: Tue, 18 Feb 2025 20:22:36 -0800 [thread overview]
Message-ID: <CADUfDZq5CDOZyyfjOgW_JE_A_GWLscZkbJDgQ-UKTbFC66FjKA@mail.gmail.com> (raw)
In-Reply-To: <[email protected]>
On Tue, Feb 18, 2025 at 2:43 PM Keith Busch <[email protected]> wrote:
>
> From: Keith Busch <[email protected]>
>
> Frequent alloc/free cycles on these is pretty costly. Use an io cache to
> more efficiently reuse these buffers.
>
> Signed-off-by: Keith Busch <[email protected]>
> ---
> include/linux/io_uring_types.h | 18 ++---
> io_uring/filetable.c | 2 +-
> io_uring/rsrc.c | 120 +++++++++++++++++++++++++--------
> io_uring/rsrc.h | 2 +-
> 4 files changed, 104 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 810d1dccd27b1..bbfb651506522 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -69,8 +69,18 @@ struct io_file_table {
> unsigned int alloc_hint;
> };
>
> +struct io_alloc_cache {
> + void **entries;
> + unsigned int nr_cached;
> + unsigned int max_cached;
> + size_t elem_size;
Is growing this field from unsigned to size_t really necessary? It
probably doesn't make sense to be caching allocations > 4 GB.
> + unsigned int init_clear;
> +};
> +
> struct io_buf_table {
> struct io_rsrc_data data;
> + struct io_alloc_cache node_cache;
> + struct io_alloc_cache imu_cache;
> };
>
> struct io_hash_bucket {
> @@ -224,14 +234,6 @@ struct io_submit_state {
> struct blk_plug plug;
> };
>
> -struct io_alloc_cache {
> - void **entries;
> - unsigned int nr_cached;
> - unsigned int max_cached;
> - unsigned int elem_size;
> - unsigned int init_clear;
> -};
> -
> struct io_ring_ctx {
> /* const or read-mostly hot data */
> struct {
> diff --git a/io_uring/filetable.c b/io_uring/filetable.c
> index dd8eeec97acf6..a21660e3145ab 100644
> --- a/io_uring/filetable.c
> +++ b/io_uring/filetable.c
> @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
> if (slot_index >= ctx->file_table.data.nr)
> return -EINVAL;
>
> - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> if (!node)
> return -ENOMEM;
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 261b5535f46c6..d5cac3a234316 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> #define IORING_MAX_FIXED_FILES (1U << 20)
> #define IORING_MAX_REG_BUFFERS (1U << 14)
>
> +#define IO_CACHED_BVECS_SEGS 32
> +
> int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
> {
> unsigned long page_limit, cur_pages, new_pages;
> @@ -101,6 +103,22 @@ int io_buffer_validate(struct iovec *iov)
> return 0;
> }
>
> +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> + int nr_bvecs)
Pass nr_bvecs as unsigned to avoid sign-extension in struct_size_t()?
> +{
> + if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
> + GFP_KERNEL);
> +}
> +
> +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
> +{
> + if (imu->nr_bvecs > IO_CACHED_BVECS_SEGS ||
> + !io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
> + kvfree(imu);
> +}
> +
> static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> {
> struct io_mapped_ubuf *imu = node->buf;
> @@ -119,22 +137,35 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> io_unaccount_mem(ctx, imu->acct_pages);
> }
>
> - kvfree(imu);
> + io_free_imu(ctx, imu);
> }
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
> {
> struct io_rsrc_node *node;
>
> - node = kzalloc(sizeof(*node), GFP_KERNEL);
> + if (type == IORING_RSRC_FILE)
> + node = kmalloc(sizeof(*node), GFP_KERNEL);
> + else
> + node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
> if (node) {
> node->type = type;
> node->refs = 1;
> + node->tag = 0;
> + node->file_ptr = 0;
> }
> return node;
> }
>
> -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
> +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
> +{
> + kvfree(data->nodes);
> + data->nodes = NULL;
> + data->nr = 0;
> +}
> +
> +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
> + struct io_rsrc_data *data)
> {
> if (!data->nr)
> return;
> @@ -142,9 +173,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
> if (data->nodes[data->nr])
> io_put_rsrc_node(ctx, data->nodes[data->nr]);
> }
> - kvfree(data->nodes);
> - data->nodes = NULL;
> - data->nr = 0;
> + __io_rsrc_data_free(data);
> }
>
> __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> @@ -158,6 +187,31 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> return -ENOMEM;
> }
>
> +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
> +{
> + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
> + IO_CACHED_BVECS_SEGS);
> + const int node_size = sizeof(struct io_rsrc_node);
> + int ret;
> +
> + ret = io_rsrc_data_alloc(&table->data, nr);
> + if (ret)
> + return ret;
> +
> + if (io_alloc_cache_init(&table->node_cache, nr, node_size, 0))
> + goto free_data;
> +
> + if (io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0))
> + goto free_cache;
> +
> + return 0;
> +free_cache:
> + io_alloc_cache_free(&table->node_cache, kfree);
> +free_data:
> + __io_rsrc_data_free(&table->data);
> + return -ENOMEM;
> +}
> +
> static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> struct io_uring_rsrc_update2 *up,
> unsigned nr_args)
> @@ -207,7 +261,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> err = -EBADF;
> break;
> }
> - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> if (!node) {
> err = -ENOMEM;
> fput(file);
> @@ -459,6 +513,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> case IORING_RSRC_BUFFER:
> if (node->buf)
> io_buffer_unmap(ctx, node);
> + if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
> + return;
> break;
> default:
> WARN_ON_ONCE(1);
> @@ -527,7 +583,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> goto fail;
> }
> ret = -ENOMEM;
> - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> if (!node) {
> fput(file);
> goto fail;
> @@ -547,11 +603,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> return ret;
> }
>
> +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
> + struct io_buf_table *table)
> +{
> + io_rsrc_data_free(ctx, &table->data);
> + io_alloc_cache_free(&table->node_cache, kfree);
> + io_alloc_cache_free(&table->imu_cache, kfree);
> +}
> +
> int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> {
> if (!ctx->buf_table.data.nr)
> return -ENXIO;
> - io_rsrc_data_free(ctx, &ctx->buf_table.data);
> + io_rsrc_buffer_free(ctx, &ctx->buf_table);
> return 0;
> }
>
> @@ -732,7 +796,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> if (!iov->iov_base)
> return NULL;
>
> - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> if (!node)
> return ERR_PTR(-ENOMEM);
> node->buf = NULL;
> @@ -752,7 +816,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
> }
>
> - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
> + imu = io_alloc_imu(ctx, nr_pages);
> if (!imu)
> goto done;
>
> @@ -789,7 +853,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> }
> done:
> if (ret) {
> - kvfree(imu);
> + io_free_imu(ctx, imu);
> if (node)
> io_put_rsrc_node(ctx, node);
> node = ERR_PTR(ret);
> @@ -802,9 +866,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> unsigned int nr_args, u64 __user *tags)
> {
> struct page *last_hpage = NULL;
> - struct io_rsrc_data data;
> struct iovec fast_iov, *iov = &fast_iov;
> const struct iovec __user *uvec;
> + struct io_buf_table table;
> int i, ret;
>
> BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
> @@ -813,13 +877,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> return -EBUSY;
> if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
> return -EINVAL;
> - ret = io_rsrc_data_alloc(&data, nr_args);
> + ret = io_rsrc_buffer_alloc(&table, nr_args);
> if (ret)
> return ret;
>
> if (!arg)
> memset(iov, 0, sizeof(*iov));
>
> + ctx->buf_table = table;
> for (i = 0; i < nr_args; i++) {
> struct io_rsrc_node *node;
> u64 tag = 0;
> @@ -859,10 +924,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> }
> node->tag = tag;
> }
> - data.nodes[i] = node;
> + table.data.nodes[i] = node;
> }
> -
> - ctx->buf_table.data = data;
Still don't see the need to move this assignment. Is there a reason
you prefer setting ctx->buf_table before initializing its nodes? I
find the existing code easier to follow, where the table is moved to
ctx->buf_table after filling it in. It's also consistent with
io_clone_buffers().
> if (ret)
> io_sqe_buffers_unregister(ctx);
> return ret;
> @@ -887,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
> goto unlock;
> }
>
> - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> if (!node) {
> ret = -ENOMEM;
> goto unlock;
> }
>
> nr_bvecs = blk_rq_nr_phys_segments(rq);
> - imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> +
> + imu = io_alloc_imu(ctx, nr_bvecs);
nit: probably don't need to add a blank line here
Best,
Caleb
> if (!imu) {
> kfree(node);
> ret = -ENOMEM;
> @@ -1031,7 +1095,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
> static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
> struct io_uring_clone_buffers *arg)
> {
> - struct io_rsrc_data data;
> + struct io_buf_table table;
> int i, ret, off, nr;
> unsigned int nbufs;
>
> @@ -1062,7 +1126,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
> return -EOVERFLOW;
>
> - ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
> + ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
> if (ret)
> return ret;
>
> @@ -1071,7 +1135,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
>
> if (src_node) {
> - data.nodes[i] = src_node;
> + table.data.nodes[i] = src_node;
> src_node->refs++;
> }
> }
> @@ -1101,7 +1165,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> if (!src_node) {
> dst_node = NULL;
> } else {
> - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> if (!dst_node) {
> ret = -ENOMEM;
> goto out_free;
> @@ -1110,12 +1174,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> refcount_inc(&src_node->buf->refs);
> dst_node->buf = src_node->buf;
> }
> - data.nodes[off++] = dst_node;
> + table.data.nodes[off++] = dst_node;
> i++;
> }
>
> /*
> - * If asked for replace, put the old table. data->nodes[] holds both
> + * If asked for replace, put the old table. table.data->nodes[] holds both
> * old and new nodes at this point.
> */
> if (arg->flags & IORING_REGISTER_DST_REPLACE)
> @@ -1128,10 +1192,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> * entry).
> */
> WARN_ON_ONCE(ctx->buf_table.data.nr);
> - ctx->buf_table.data = data;
> + ctx->buf_table = table;
> return 0;
> out_free:
> - io_rsrc_data_free(ctx, &data);
> + io_rsrc_buffer_free(ctx, &table);
> return ret;
> }
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 81c31b93d4f7b..69cc212ad0c7c 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -49,7 +49,7 @@ struct io_imu_folio_data {
> unsigned int nr_folios;
> };
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type);
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
> void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
> void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
> int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
> --
> 2.43.5
>
next prev parent reply other threads:[~2025-02-19 4:22 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-18 22:42 [PATCHv4 0/5] ublk zero-copy support Keith Busch
2025-02-18 22:42 ` [PATCHv4 1/5] io_uring: move fixed buffer import to issue path Keith Busch
2025-02-19 1:27 ` Caleb Sander Mateos
2025-02-19 4:23 ` Ming Lei
2025-02-19 16:48 ` Pavel Begunkov
2025-02-19 17:15 ` Pavel Begunkov
2025-02-20 1:25 ` Keith Busch
2025-02-20 10:12 ` Pavel Begunkov
2025-02-18 22:42 ` [PATCHv4 2/5] io_uring: add support for kernel registered bvecs Keith Busch
2025-02-19 1:54 ` Caleb Sander Mateos
2025-02-19 17:23 ` Pavel Begunkov
2025-02-20 10:31 ` Pavel Begunkov
2025-02-20 10:38 ` Pavel Begunkov
2025-02-18 22:42 ` [PATCHv4 3/5] ublk: zc register/unregister bvec Keith Busch
2025-02-19 2:36 ` Caleb Sander Mateos
2025-02-20 11:11 ` Pavel Begunkov
2025-02-24 21:02 ` Keith Busch
2025-02-18 22:42 ` [PATCHv4 4/5] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-19 3:04 ` Caleb Sander Mateos
2025-02-18 22:42 ` [PATCHv4 5/5] io_uring: cache nodes and mapped buffers Keith Busch
2025-02-19 4:22 ` Caleb Sander Mateos [this message]
2025-02-24 21:01 ` Keith Busch
2025-02-24 21:39 ` Caleb Sander Mateos
2025-02-20 11:08 ` Pavel Begunkov
2025-02-20 15:24 ` Keith Busch
2025-02-20 16:06 ` Pavel Begunkov
2025-02-24 21:04 ` Keith Busch
2025-02-25 13:06 ` Pavel Begunkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CADUfDZq5CDOZyyfjOgW_JE_A_GWLscZkbJDgQ-UKTbFC66FjKA@mail.gmail.com \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox