From: "Clément Léger" <cleger@meta.com>
To: Jens Axboe <axboe@kernel.dk>, io-uring <io-uring@vger.kernel.org>
Subject: Re: [PATCH v3] io_uring/rsrc: add huge page accounting for registered buffers
Date: Mon, 4 May 2026 09:31:49 +0200
Message-ID: <f2e104e2-0110-4dca-a285-81b0fc63a272@meta.com>
In-Reply-To: <d377141e-064e-48a2-9a76-8477a90e8655@kernel.dk>
On 5/2/26 04:31, Jens Axboe wrote:
>
> Track huge page references in a per-ring xarray to prevent double
> accounting when the same huge page is used by multiple registered
> buffers, either within the same ring or across cloned rings.
>
> When registering buffers backed by huge pages, we need to account for
> RLIMIT_MEMLOCK. But if multiple buffers share the same huge page (common
> with cloned buffers), we must not account for the same page multiple
> times. Similarly, we must only unaccount when the last reference to a
> huge page is released.
>
> Maintain a per-ring xarray (hpage_acct) that tracks reference counts for
> each huge page. When registering a buffer, for each unique huge page,
> increment its accounting reference count, and only account pages that
> are newly added.
>
> When unregistering a buffer, for each unique huge page, decrement its
> refcount. Once the refcount hits zero, the page is unaccounted.
>
> Note: any accounting is done against the ctx->user that was assigned when
> the ring was set up. As before, if root is running the operation, no
> accounting is done.
>
> With these changes, imu->acct_pages is dead, so kill it from struct
> io_mapped_ubuf. This shrinks the struct from 56 to 48 bytes on a 64-bit
> arch. Additionally, headpage_already_acct() is gone, which was a quadratic
> scan over the current page array plus all previously registered buffers.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>
> ---
>
> See previous discussions here:
>
> https://lore.kernel.org/io-uring/20260119071039.2113739-1-danisjiang@gmail.com/
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 244392026c6d..23b8891d5704 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -446,6 +446,9 @@ struct io_ring_ctx {
> /* Stores zcrx object pointers of type struct io_zcrx_ifq */
> struct xarray zcrx_ctxs;
>
> + /* Used for accounting references on pages in registered buffers */
> + struct xarray hpage_acct;
> +
> u32 pers_next;
> struct xarray personalities;
>
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 4ed998d60c09..fb6ed52bae61 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -233,6 +233,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> return NULL;
>
> xa_init(&ctx->io_bl_xa);
> + xa_init(&ctx->hpage_acct);
>
> /*
> * Use 5 bits less than the max cq entries, that should give us around
> @@ -302,6 +303,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> io_free_alloc_caches(ctx);
> kvfree(ctx->cancel_table.hbs);
> xa_destroy(&ctx->io_bl_xa);
> + xa_destroy(&ctx->hpage_acct);
> kfree(ctx);
> return NULL;
> }
> @@ -2198,6 +2200,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
> io_napi_free(ctx);
> kvfree(ctx->cancel_table.hbs);
> xa_destroy(&ctx->io_bl_xa);
> + xa_destroy(&ctx->hpage_acct);
> kfree(ctx);
> }
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 650303626be6..ca22e07245c4 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -28,7 +28,51 @@ struct io_rsrc_update {
> };
>
> static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> - struct iovec *iov, struct page **last_hpage);
> + struct iovec *iov);
> +
> +static int hpage_acct_ref(struct io_ring_ctx *ctx, struct page *hpage,
> + bool *acct_new)
> +{
> + unsigned long key = (unsigned long) hpage;
> + unsigned long count;
> + void *entry;
> + int ret;
> +
> + lockdep_assert_held(&ctx->uring_lock);
> +
> + entry = xa_load(&ctx->hpage_acct, key);
> + if (!entry) {
> + ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
> + if (ret)
> + return ret;
> + }
> +
> + count = 1;
> + if (entry)
> + count = xa_to_value(entry) + 1;
Hi Jens,

Can't most of this be merged into the previous if/else? i.e.:

	entry = xa_load(&ctx->hpage_acct, key);
	count = 1;
	if (!entry) {
		ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
		if (ret)
			return ret;
		*acct_new = true;
	} else {
		count = xa_to_value(entry) + 1;
		*acct_new = false;
	}
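That would also let you drop the `*acct_new = (count == 1);` below the
xa_store(), since both branches already set it.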
> + xa_store(&ctx->hpage_acct, key, xa_mk_value(count), GFP_KERNEL_ACCOUNT);
> + *acct_new = (count == 1);
> + return 0;
> +}
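Unrelated nit: I take it xa_reserve() is there so the later xa_store() on
the reserved slot can't fail? Might be worth a comment saying so, since the
naive version of this refcount idiom would otherwise just be (simplified
sketch, made-up name, ignores store failures -- not a replacement for the
patch):

	/* take a ref on key; returns true when this is the first ref */
	static bool xa_refcount_get(struct xarray *xa, unsigned long key)
	{
		void *entry = xa_load(xa, key);
		unsigned long count = entry ? xa_to_value(entry) + 1 : 1;

		xa_store(xa, key, xa_mk_value(count), GFP_KERNEL);
		return count == 1;
	}

and ignoring the xa_store() return there is exactly what the reserve avoids.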
> +
> +static bool hpage_acct_unref(struct io_ring_ctx *ctx, struct page *hpage)
> +{
> + unsigned long key = (unsigned long) hpage;
> + unsigned long count;
> + void *entry;
> +
> + lockdep_assert_held(&ctx->uring_lock);
> +
> + entry = xa_load(&ctx->hpage_acct, key);
> + if (WARN_ON_ONCE(!entry))
> + return false;
> + count = xa_to_value(entry);
> + if (count == 1)
> + xa_erase(&ctx->hpage_acct, key);
> + else
> + xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1), GFP_KERNEL_ACCOUNT);
> + return count == 1;
Maybe something like this would be easier to read?

	if (count == 1) {
		xa_erase(&ctx->hpage_acct, key);
		return true;
	}
	xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1),
		 GFP_KERNEL_ACCOUNT);
	return false;
> +}
>
> /* only define max */
> #define IORING_MAX_FIXED_FILES (1U << 20)
> @@ -124,15 +168,53 @@ static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
> kvfree(imu);
> }
>
> +static unsigned long io_buffer_unaccount_pages(struct io_ring_ctx *ctx,
> + struct io_mapped_ubuf *imu)
> +{
> + struct page *seen = NULL;
> + unsigned long acct = 0;
> + int i;
> +
> + if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user)
> + return 0;
> +
> + for (i = 0; i < imu->nr_bvecs; i++) {
> + struct page *page = imu->bvec[i].bv_page;
> + struct page *hpage;
> +
> + if (!PageCompound(page)) {
> + acct++;
> + continue;
> + }
> +
> + hpage = compound_head(page);
> + if (hpage == seen)
> + continue;
> + seen = hpage;
> +
> + /* Unaccount on last reference */
> + if (hpage_acct_unref(ctx, hpage))
> + acct += page_size(hpage) >> PAGE_SHIFT;
> + cond_resched();
> + }
> +
> + return acct;
> +}
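AFAICT the single-entry `seen` cache is enough here: subpages of the same
compound page are adjacent in a pinned, virtually contiguous range, so a
head page shouldn't reappear later in the bvec array. And even if it did,
ref/unref stay symmetric per occurrence, so the accounting would still
balance out.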
> +
> static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
> {
> + unsigned long acct_pages = 0;
> +
> + /* Always decrement, so it works for cloned buffers too */
> + acct_pages = io_buffer_unaccount_pages(ctx, imu);
> +
> if (unlikely(refcount_read(&imu->refs) > 1)) {
> if (!refcount_dec_and_test(&imu->refs))
> return;
> }
>
> - if (imu->acct_pages)
> - io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
> + if (acct_pages)
> + io_unaccount_mem(ctx->user, ctx->mm_account, acct_pages);
> imu->release(imu->priv);
> io_free_imu(ctx, imu);
> }
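It took me a second to see why the unaccount walk runs before the imu->refs
check: each ring holds its own refs on the huge pages in its hpage_acct, so
those must be dropped on every unmap, while the actual io_unaccount_mem()
only happens on the final put. Maybe worth expanding the comment a bit?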
> @@ -282,7 +364,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
> {
> u64 __user *tags = u64_to_user_ptr(up->tags);
> struct iovec fast_iov, *iov;
> - struct page *last_hpage = NULL;
> struct iovec __user *uvec;
> u64 user_data = up->data;
> __u32 done;
> @@ -307,7 +388,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
> err = -EFAULT;
> break;
> }
> - node = io_sqe_buffer_register(ctx, iov, &last_hpage);
> + node = io_sqe_buffer_register(ctx, iov);
> if (IS_ERR(node)) {
> err = PTR_ERR(node);
> break;
> @@ -605,76 +686,79 @@ int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> }
>
> /*
> - * Not super efficient, but this is just a registration time. And we do cache
> - * the last compound head, so generally we'll only do a full search if we don't
> - * match that one.
> - *
> - * We check if the given compound head page has already been accounted, to
> - * avoid double accounting it. This allows us to account the full size of the
> - * page, not just the constituent pages of a huge page.
> + * Undo hpage_acct_ref() calls made during io_buffer_account_pin() on failure.
> + * This operates on the pages array since imu->bvec isn't populated yet.
> */
> -static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
> - int nr_pages, struct page *hpage)
> +static void io_buffer_unaccount_hpages(struct io_ring_ctx *ctx,
> + struct page **pages, int nr_pages)
> {
> - int i, j;
> + struct page *seen = NULL;
> + int i;
> +
> + if (!ctx->user)
> + return;
>
> - /* check current page array */
> for (i = 0; i < nr_pages; i++) {
> + struct page *hpage;
> +
> if (!PageCompound(pages[i]))
> continue;
> - if (compound_head(pages[i]) == hpage)
> - return true;
> - }
> -
> - /* check previously registered pages */
> - for (i = 0; i < ctx->buf_table.nr; i++) {
> - struct io_rsrc_node *node = ctx->buf_table.nodes[i];
> - struct io_mapped_ubuf *imu;
>
> - if (!node)
> + hpage = compound_head(pages[i]);
> + if (hpage == seen)
> continue;
> - imu = node->buf;
> - for (j = 0; j < imu->nr_bvecs; j++) {
> - if (!PageCompound(imu->bvec[j].bv_page))
> - continue;
> - if (compound_head(imu->bvec[j].bv_page) == hpage)
> - return true;
> - }
> - }
> + seen = hpage;
>
> - return false;
> + hpage_acct_unref(ctx, hpage);
> + cond_resched();
> + }
> }
>
> static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
> - int nr_pages, struct io_mapped_ubuf *imu,
> - struct page **last_hpage)
> + int nr_pages)
> {
> + unsigned long acct_pages = 0;
> + struct page *seen = NULL;
> int i, ret;
>
> - imu->acct_pages = 0;
> + if (!ctx->user)
> + return 0;
> +
> for (i = 0; i < nr_pages; i++) {
> + struct page *hpage;
> + bool acct_new;
> +
> if (!PageCompound(pages[i])) {
> - imu->acct_pages++;
> - } else {
> - struct page *hpage;
> -
> - hpage = compound_head(pages[i]);
> - if (hpage == *last_hpage)
> - continue;
> - *last_hpage = hpage;
> - if (headpage_already_acct(ctx, pages, i, hpage))
> - continue;
> - imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
> + acct_pages++;
> + continue;
> }
> +
> + hpage = compound_head(pages[i]);
> + if (hpage == seen)
> + continue;
> + seen = hpage;
> +
> + ret = hpage_acct_ref(ctx, hpage, &acct_new);
> + if (ret) {
> + io_buffer_unaccount_hpages(ctx, pages, i);
> + return ret;
> + }
> + if (acct_new)
> + acct_pages += page_size(hpage) >> PAGE_SHIFT;
> + cond_resched();
> }
>
> - if (!imu->acct_pages)
> - return 0;
> + /* Try to account the memory */
> + if (acct_pages) {
> + ret = io_account_mem(ctx->user, ctx->mm_account, acct_pages);
> + if (ret) {
> + /* Undo the refs we just added */
> + io_buffer_unaccount_hpages(ctx, pages, nr_pages);
> + return ret;
> + }
> + }
>
> - ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
> - if (ret)
> - imu->acct_pages = 0;
> - return ret;
> + return 0;
> }
>
> static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
> @@ -763,8 +847,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> }
>
> static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> - struct iovec *iov,
> - struct page **last_hpage)
> + struct iovec *iov)
> {
> struct io_mapped_ubuf *imu = NULL;
> struct page **pages = NULL;
> @@ -811,7 +894,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> goto done;
>
> imu->nr_bvecs = nr_pages;
> - ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
> + ret = io_buffer_account_pin(ctx, pages, nr_pages);
> if (ret)
> goto done;
>
> @@ -861,7 +944,6 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> unsigned int nr_args, u64 __user *tags)
> {
> - struct page *last_hpage = NULL;
> struct io_rsrc_data data;
> struct iovec fast_iov, *iov = &fast_iov;
> const struct iovec __user *uvec;
> @@ -904,7 +986,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> }
> }
>
> - node = io_sqe_buffer_register(ctx, iov, &last_hpage);
> + node = io_sqe_buffer_register(ctx, iov);
> if (IS_ERR(node)) {
> ret = PTR_ERR(node);
> break;
> @@ -971,7 +1053,6 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
>
> imu->ubuf = 0;
> imu->len = blk_rq_bytes(rq);
> - imu->acct_pages = 0;
> imu->folio_shift = PAGE_SHIFT;
> refcount_set(&imu->refs, 1);
> imu->release = release;
> @@ -1137,6 +1218,56 @@ int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
> }
>
> /* Lock two rings at once. The rings must be different! */
Nit: with io_buffer_acct_cloned_hpages() added below, this comment no longer
sits above lock_two_rings() -- it should move down with it.
> +static int io_buffer_acct_cloned_hpages(struct io_ring_ctx *ctx,
> + struct io_mapped_ubuf *imu)
> +{
> + struct page *seen = NULL;
> + int i, ret = 0;
> +
> + if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user)
> + return 0;
> +
> + for (i = 0; i < imu->nr_bvecs; i++) {
> + struct page *page = imu->bvec[i].bv_page;
> + struct page *hpage;
> + bool acct_new;
> +
> + if (!PageCompound(page))
> + continue;
> +
> + hpage = compound_head(page);
> + if (hpage == seen)
> + continue;
> + seen = hpage;
> +
> + /* Atomically add reference for cloned buffer */
> + ret = hpage_acct_ref(ctx, hpage, &acct_new);
> + if (ret)
> + break;
> +
> + cond_resched();
> + }
> +
> + if (!ret)
> + return 0;
> +
> + /* Undo refs we added for bvecs [0..i) */
> + seen = NULL;
> + for (int j = 0; j < i; j++) {
> + struct page *p = imu->bvec[j].bv_page;
> + struct page *hp;
> +
> + if (!PageCompound(p))
> + continue;
> + hp = compound_head(p);
> + if (hp == seen)
> + continue;
> + seen = hp;
> + hpage_acct_unref(ctx, hp);
> + }
> + return ret;
> +}
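One more thought: this unwind loop re-implements the dedup walk a third time
(after io_buffer_unaccount_pages() and io_buffer_unaccount_hpages()). A small
bvec-based helper could at least serve this one, something like (untested,
name made up):

	static void io_hpage_unref_bvecs(struct io_ring_ctx *ctx,
					 const struct bio_vec *bvec, int nr)
	{
		struct page *seen = NULL;
		int i;

		for (i = 0; i < nr; i++) {
			struct page *hpage;

			if (!PageCompound(bvec[i].bv_page))
				continue;
			hpage = compound_head(bvec[i].bv_page);
			if (hpage == seen)
				continue;
			seen = hpage;
			hpage_acct_unref(ctx, hpage);
		}
	}

called here as io_hpage_unref_bvecs(ctx, imu->bvec, i).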
> +
> static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
> {
> if (ctx1 > ctx2)
> @@ -1218,6 +1349,14 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
>
> refcount_inc(&src_node->buf->refs);
> dst_node->buf = src_node->buf;
> + /* track compound references to clones */
> + ret = io_buffer_acct_cloned_hpages(ctx, src_node->buf);
> + if (ret) {
> + refcount_dec(&src_node->buf->refs);
> + io_cache_free(&ctx->node_cache, dst_node);
> + io_rsrc_data_free(ctx, &data);
> + return ret;
> + }
> }
> data.nodes[off++] = dst_node;
> i++;
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 44e3386f7c1c..c0f8a18ec767 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -38,7 +38,6 @@ struct io_mapped_ubuf {
> unsigned int nr_bvecs;
> unsigned int folio_shift;
> refcount_t refs;
> - unsigned long acct_pages;
> void (*release)(void *);
> void *priv;
> u8 flags;
>
Thanks,
Clément