* [PATCH v3] io_uring/rsrc: add huge page accounting for registered buffers
From: Jens Axboe @ 2026-05-02 2:31 UTC
To: io-uring
Track huge page references in a per-ring xarray to prevent double
accounting when the same huge page is used by multiple registered
buffers, either within the same ring or across cloned rings.
When registering buffers backed by huge pages, the pinned memory must be
accounted against RLIMIT_MEMLOCK. But if multiple buffers share the same
huge page (common with cloned buffers), we must not account the same page
multiple times. Similarly, we must only unaccount a huge page when the
last reference to it is released.
Maintain a per-ring xarray (hpage_acct) that tracks reference counts for
each huge page. When registering a buffer, for each unique huge page,
increment its accounting reference count, and only account pages that
are newly added.
When unregistering a buffer, for each unique huge page, decrement its
refcount. Once the refcount hits zero, the page is unaccounted.
Note: any accounting is done against the ctx->user that was assigned when
the ring was set up. As before, if root is running the operation, no
accounting is done.
With these changes, every use of imu->acct_pages is dead, hence kill it
from struct io_mapped_ubuf. This shrinks the struct from 56 to 48 bytes on
a 64-bit arch. Additionally, headpage_already_acct() is gone, which was an
O(M*M) scan over the current and all previously registered buffers.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
See previous discussions here:
https://lore.kernel.org/io-uring/20260119071039.2113739-1-danisjiang@gmail.com/
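For anyone who wants to exercise the dedup path from userspace, here's a
quick sketch (untested, assumes liburing and a populated 2M hugetlb pool):
registering two buffers carved out of the same huge page should charge
RLIMIT_MEMLOCK for one huge page, not two. Run it as non-root with a small
memlock limit to see the difference.

#include <liburing.h>
#include <stdio.h>
#include <sys/mman.h>

#define HPAGE_SZ	(2UL * 1024 * 1024)

int main(void)
{
	struct io_uring ring;
	struct iovec iovs[2];
	void *buf;
	int ret;

	/* one 2M huge page backing both registered buffers */
	buf = mmap(NULL, HPAGE_SZ, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	iovs[0].iov_base = buf;
	iovs[0].iov_len = HPAGE_SZ / 2;
	iovs[1].iov_base = (char *)buf + HPAGE_SZ / 2;
	iovs[1].iov_len = HPAGE_SZ / 2;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	/* both halves share one compound head: should be accounted once */
	ret = io_uring_register_buffers(&ring, iovs, 2);
	printf("register: %d\n", ret);

	io_uring_unregister_buffers(&ring);
	io_uring_queue_exit(&ring);
	return 0;
}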
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 244392026c6d..23b8891d5704 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -446,6 +446,9 @@ struct io_ring_ctx {
/* Stores zcrx object pointers of type struct io_zcrx_ifq */
struct xarray zcrx_ctxs;
+ /* Used for accounting references on pages in registered buffers */
+ struct xarray hpage_acct;
+
u32 pers_next;
struct xarray personalities;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4ed998d60c09..fb6ed52bae61 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -233,6 +233,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
return NULL;
xa_init(&ctx->io_bl_xa);
+ xa_init(&ctx->hpage_acct);
/*
* Use 5 bits less than the max cq entries, that should give us around
@@ -302,6 +303,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
io_free_alloc_caches(ctx);
kvfree(ctx->cancel_table.hbs);
xa_destroy(&ctx->io_bl_xa);
+ xa_destroy(&ctx->hpage_acct);
kfree(ctx);
return NULL;
}
@@ -2198,6 +2200,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_napi_free(ctx);
kvfree(ctx->cancel_table.hbs);
xa_destroy(&ctx->io_bl_xa);
+ xa_destroy(&ctx->hpage_acct);
kfree(ctx);
}
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 650303626be6..ca22e07245c4 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -28,7 +28,51 @@ struct io_rsrc_update {
};
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
- struct iovec *iov, struct page **last_hpage);
+ struct iovec *iov);
+
+static int hpage_acct_ref(struct io_ring_ctx *ctx, struct page *hpage,
+ bool *acct_new)
+{
+ unsigned long key = (unsigned long) hpage;
+ unsigned long count;
+ void *entry;
+ int ret;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ entry = xa_load(&ctx->hpage_acct, key);
+ if (!entry) {
+ ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
+ if (ret)
+ return ret;
+ }
+
+ count = 1;
+ if (entry)
+ count = xa_to_value(entry) + 1;
+ xa_store(&ctx->hpage_acct, key, xa_mk_value(count), GFP_KERNEL_ACCOUNT);
+ *acct_new = (count == 1);
+ return 0;
+}
+
+static bool hpage_acct_unref(struct io_ring_ctx *ctx, struct page *hpage)
+{
+ unsigned long key = (unsigned long) hpage;
+ unsigned long count;
+ void *entry;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ entry = xa_load(&ctx->hpage_acct, key);
+ if (WARN_ON_ONCE(!entry))
+ return false;
+ count = xa_to_value(entry);
+ if (count == 1)
+ xa_erase(&ctx->hpage_acct, key);
+ else
+ xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1), GFP_KERNEL_ACCOUNT);
+ return count == 1;
+}
/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
@@ -124,15 +168,53 @@ static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
kvfree(imu);
}
+static unsigned long io_buffer_unaccount_pages(struct io_ring_ctx *ctx,
+ struct io_mapped_ubuf *imu)
+{
+ struct page *seen = NULL;
+ unsigned long acct = 0;
+ int i;
+
+ if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user)
+ return 0;
+
+ for (i = 0; i < imu->nr_bvecs; i++) {
+ struct page *page = imu->bvec[i].bv_page;
+ struct page *hpage;
+
+ if (!PageCompound(page)) {
+ acct++;
+ continue;
+ }
+
+ hpage = compound_head(page);
+ if (hpage == seen)
+ continue;
+ seen = hpage;
+
+ /* Unaccount on last reference */
+ if (hpage_acct_unref(ctx, hpage))
+ acct += page_size(hpage) >> PAGE_SHIFT;
+ cond_resched();
+ }
+
+ return acct;
+}
+
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
+ unsigned long acct_pages = 0;
+
+ /* Always decrement, so it works for cloned buffers too */
+ acct_pages = io_buffer_unaccount_pages(ctx, imu);
+
if (unlikely(refcount_read(&imu->refs) > 1)) {
if (!refcount_dec_and_test(&imu->refs))
return;
}
- if (imu->acct_pages)
- io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
+ if (acct_pages)
+ io_unaccount_mem(ctx->user, ctx->mm_account, acct_pages);
imu->release(imu->priv);
io_free_imu(ctx, imu);
}
@@ -282,7 +364,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
{
u64 __user *tags = u64_to_user_ptr(up->tags);
struct iovec fast_iov, *iov;
- struct page *last_hpage = NULL;
struct iovec __user *uvec;
u64 user_data = up->data;
__u32 done;
@@ -307,7 +388,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
err = -EFAULT;
break;
}
- node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+ node = io_sqe_buffer_register(ctx, iov);
if (IS_ERR(node)) {
err = PTR_ERR(node);
break;
@@ -605,76 +686,79 @@ int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
}
/*
- * Not super efficient, but this is just a registration time. And we do cache
- * the last compound head, so generally we'll only do a full search if we don't
- * match that one.
- *
- * We check if the given compound head page has already been accounted, to
- * avoid double accounting it. This allows us to account the full size of the
- * page, not just the constituent pages of a huge page.
+ * Undo hpage_acct_ref() calls made during io_buffer_account_pin() on failure.
+ * This operates on the pages array since imu->bvec isn't populated yet.
*/
-static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
- int nr_pages, struct page *hpage)
+static void io_buffer_unaccount_hpages(struct io_ring_ctx *ctx,
+ struct page **pages, int nr_pages)
{
- int i, j;
+ struct page *seen = NULL;
+ int i;
+
+ if (!ctx->user)
+ return;
- /* check current page array */
for (i = 0; i < nr_pages; i++) {
+ struct page *hpage;
+
if (!PageCompound(pages[i]))
continue;
- if (compound_head(pages[i]) == hpage)
- return true;
- }
-
- /* check previously registered pages */
- for (i = 0; i < ctx->buf_table.nr; i++) {
- struct io_rsrc_node *node = ctx->buf_table.nodes[i];
- struct io_mapped_ubuf *imu;
- if (!node)
+ hpage = compound_head(pages[i]);
+ if (hpage == seen)
continue;
- imu = node->buf;
- for (j = 0; j < imu->nr_bvecs; j++) {
- if (!PageCompound(imu->bvec[j].bv_page))
- continue;
- if (compound_head(imu->bvec[j].bv_page) == hpage)
- return true;
- }
- }
+ seen = hpage;
- return false;
+ hpage_acct_unref(ctx, hpage);
+ cond_resched();
+ }
}
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
- int nr_pages, struct io_mapped_ubuf *imu,
- struct page **last_hpage)
+ int nr_pages)
{
+ unsigned long acct_pages = 0;
+ struct page *seen = NULL;
int i, ret;
- imu->acct_pages = 0;
+ if (!ctx->user)
+ return 0;
+
for (i = 0; i < nr_pages; i++) {
+ struct page *hpage;
+ bool acct_new;
+
if (!PageCompound(pages[i])) {
- imu->acct_pages++;
- } else {
- struct page *hpage;
-
- hpage = compound_head(pages[i]);
- if (hpage == *last_hpage)
- continue;
- *last_hpage = hpage;
- if (headpage_already_acct(ctx, pages, i, hpage))
- continue;
- imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
+ acct_pages++;
+ continue;
}
+
+ hpage = compound_head(pages[i]);
+ if (hpage == seen)
+ continue;
+ seen = hpage;
+
+ ret = hpage_acct_ref(ctx, hpage, &acct_new);
+ if (ret) {
+ io_buffer_unaccount_hpages(ctx, pages, i);
+ return ret;
+ }
+ if (acct_new)
+ acct_pages += page_size(hpage) >> PAGE_SHIFT;
+ cond_resched();
}
- if (!imu->acct_pages)
- return 0;
+ /* Try to account the memory */
+ if (acct_pages) {
+ ret = io_account_mem(ctx->user, ctx->mm_account, acct_pages);
+ if (ret) {
+ /* Undo the refs we just added */
+ io_buffer_unaccount_hpages(ctx, pages, nr_pages);
+ return ret;
+ }
+ }
- ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
- if (ret)
- imu->acct_pages = 0;
- return ret;
+ return 0;
}
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
@@ -763,8 +847,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
}
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
- struct iovec *iov,
- struct page **last_hpage)
+ struct iovec *iov)
{
struct io_mapped_ubuf *imu = NULL;
struct page **pages = NULL;
@@ -811,7 +894,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
goto done;
imu->nr_bvecs = nr_pages;
- ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
+ ret = io_buffer_account_pin(ctx, pages, nr_pages);
if (ret)
goto done;
@@ -861,7 +944,6 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags)
{
- struct page *last_hpage = NULL;
struct io_rsrc_data data;
struct iovec fast_iov, *iov = &fast_iov;
const struct iovec __user *uvec;
@@ -904,7 +986,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
}
- node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+ node = io_sqe_buffer_register(ctx, iov);
if (IS_ERR(node)) {
ret = PTR_ERR(node);
break;
@@ -971,7 +1053,6 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
imu->ubuf = 0;
imu->len = blk_rq_bytes(rq);
- imu->acct_pages = 0;
imu->folio_shift = PAGE_SHIFT;
refcount_set(&imu->refs, 1);
imu->release = release;
@@ -1137,6 +1218,56 @@ int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
}
/* Lock two rings at once. The rings must be different! */
+static int io_buffer_acct_cloned_hpages(struct io_ring_ctx *ctx,
+ struct io_mapped_ubuf *imu)
+{
+ struct page *seen = NULL;
+ int i, ret = 0;
+
+ if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user)
+ return 0;
+
+ for (i = 0; i < imu->nr_bvecs; i++) {
+ struct page *page = imu->bvec[i].bv_page;
+ struct page *hpage;
+ bool acct_new;
+
+ if (!PageCompound(page))
+ continue;
+
+ hpage = compound_head(page);
+ if (hpage == seen)
+ continue;
+ seen = hpage;
+
+ /* Atomically add reference for cloned buffer */
+ ret = hpage_acct_ref(ctx, hpage, &acct_new);
+ if (ret)
+ break;
+
+ cond_resched();
+ }
+
+ if (!ret)
+ return 0;
+
+ /* Undo refs we added for bvecs [0..i) */
+ seen = NULL;
+ for (int j = 0; j < i; j++) {
+ struct page *p = imu->bvec[j].bv_page;
+ struct page *hp;
+
+ if (!PageCompound(p))
+ continue;
+ hp = compound_head(p);
+ if (hp == seen)
+ continue;
+ seen = hp;
+ hpage_acct_unref(ctx, hp);
+ }
+ return ret;
+}
+
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
if (ctx1 > ctx2)
@@ -1218,6 +1349,14 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
refcount_inc(&src_node->buf->refs);
dst_node->buf = src_node->buf;
+ /* track compound references to clones */
+ ret = io_buffer_acct_cloned_hpages(ctx, src_node->buf);
+ if (ret) {
+ refcount_dec(&src_node->buf->refs);
+ io_cache_free(&ctx->node_cache, dst_node);
+ io_rsrc_data_free(ctx, &data);
+ return ret;
+ }
}
data.nodes[off++] = dst_node;
i++;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 44e3386f7c1c..c0f8a18ec767 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -38,7 +38,6 @@ struct io_mapped_ubuf {
unsigned int nr_bvecs;
unsigned int folio_shift;
refcount_t refs;
- unsigned long acct_pages;
void (*release)(void *);
void *priv;
u8 flags;
--
Jens Axboe
* Re: [PATCH v3] io_uring/rsrc: add huge page accounting for registered buffers
From: Clément Léger @ 2026-05-04 7:31 UTC
To: Jens Axboe, io-uring
On 5/2/26 04:31, Jens Axboe wrote:
[...]
> +static int hpage_acct_ref(struct io_ring_ctx *ctx, struct page *hpage,
> + bool *acct_new)
> +{
> + unsigned long key = (unsigned long) hpage;
> + unsigned long count;
> + void *entry;
> + int ret;
> +
> + lockdep_assert_held(&ctx->uring_lock);
> +
> + entry = xa_load(&ctx->hpage_acct, key);
> + if (!entry) {
> + ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
> + if (ret)
> + return ret;
> + }
> +
> + count = 1;
> + if (entry)
> + count = xa_to_value(entry) + 1;
Hi Jens,
Can't most of this be merged into the previous if/else? E.g.:
entry = xa_load(&ctx->hpage_acct, key);
count = 1;
if (!entry) {
ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
if (ret)
return ret;
*acct_new = true;
} else {
count = xa_to_value(entry) + 1;
*acct_new = false;
}
> + xa_store(&ctx->hpage_acct, key, xa_mk_value(count), GFP_KERNEL_ACCOUNT);
> + *acct_new = (count == 1);
> + return 0;
> +}
> +
> +static bool hpage_acct_unref(struct io_ring_ctx *ctx, struct page *hpage)
> +{
> + unsigned long key = (unsigned long) hpage;
> + unsigned long count;
> + void *entry;
> +
> + lockdep_assert_held(&ctx->uring_lock);
> +
> + entry = xa_load(&ctx->hpage_acct, key);
> + if (WARN_ON_ONCE(!entry))
> + return false;
> + count = xa_to_value(entry);
> + if (count == 1)
> + xa_erase(&ctx->hpage_acct, key);
> + else
> + xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1), GFP_KERNEL_ACCOUNT);
> + return count == 1;
Maybe something like this would be easier to read?
if (count == 1) {
xa_erase(&ctx->hpage_acct, key);
return true;
}
xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1),
GFP_KERNEL_ACCOUNT);
return false;
> +}
[...]
> /* Lock two rings at once. The rings must be different! */
This comment should be before lock_two_rings().
Thanks,
Clément
* Re: [PATCH v3] io_uring/rsrc: add huge page accounting for registered buffers
From: Jens Axboe @ 2026-05-04 7:36 UTC
To: Clément Léger, io-uring
On 5/4/26 1:31 AM, Clément Léger wrote:
>> static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>> - struct iovec *iov, struct page **last_hpage);
>> + struct iovec *iov);
>> +
>> +static int hpage_acct_ref(struct io_ring_ctx *ctx, struct page *hpage,
>> + bool *acct_new)
>> +{
>> + unsigned long key = (unsigned long) hpage;
>> + unsigned long count;
>> + void *entry;
>> + int ret;
>> +
>> + lockdep_assert_held(&ctx->uring_lock);
>> +
>> + entry = xa_load(&ctx->hpage_acct, key);
>> + if (!entry) {
>> + ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> + count = 1;
>> + if (entry)
>> + count = xa_to_value(entry) + 1;
>
> Hi Jens,
>
> Can't most of this be merged into the previous if/else? E.g.:
>
> entry = xa_load(&ctx->hpage_acct, key);
> count = 1;
> if (!entry) {
> ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
> if (ret)
> return ret;
> *acct_new = true;
> } else {
> count = xa_to_value(entry) + 1;
> *acct_new = false;
> }
Agree, that's nicer. I'll make that change!
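So it'd end up roughly like the below - untested, just folding your
suggestion in:

static int hpage_acct_ref(struct io_ring_ctx *ctx, struct page *hpage,
			  bool *acct_new)
{
	unsigned long key = (unsigned long) hpage;
	unsigned long count = 1;
	void *entry;
	int ret;

	lockdep_assert_held(&ctx->uring_lock);

	entry = xa_load(&ctx->hpage_acct, key);
	if (!entry) {
		/* first ref on this huge page */
		ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
		if (ret)
			return ret;
	} else {
		count = xa_to_value(entry) + 1;
	}

	xa_store(&ctx->hpage_acct, key, xa_mk_value(count), GFP_KERNEL_ACCOUNT);
	*acct_new = (count == 1);
	return 0;
}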
>> +static bool hpage_acct_unref(struct io_ring_ctx *ctx, struct page *hpage)
>> +{
>> + unsigned long key = (unsigned long) hpage;
>> + unsigned long count;
>> + void *entry;
>> +
>> + lockdep_assert_held(&ctx->uring_lock);
>> +
>> + entry = xa_load(&ctx->hpage_acct, key);
>> + if (WARN_ON_ONCE(!entry))
>> + return false;
>> + count = xa_to_value(entry);
>> + if (count == 1)
>> + xa_erase(&ctx->hpage_acct, key);
>> + else
>> + xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1), GFP_KERNEL_ACCOUNT);
>> + return count == 1;
>
> Maybe something like this would be easier to read?
>
> if (count == 1) {
> xa_erase(&ctx->hpage_acct, key);
> return true;
> }
>
> xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1), GFP_KERNEL_ACCOUNT);
> return false;
Also agree, that's easier to read.
>> @@ -971,7 +1053,6 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
>> imu->ubuf = 0;
>> imu->len = blk_rq_bytes(rq);
>> - imu->acct_pages = 0;
>> imu->folio_shift = PAGE_SHIFT;
>> refcount_set(&imu->refs, 1);
>> imu->release = release;
>> @@ -1137,6 +1218,56 @@ int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
>> }
>> /* Lock two rings at once. The rings must be different! */
>
> This comment should be before lock_two_rings().
Yeah, I think this happened during forward porting; the patch is
originally from three months ago. I'll fix that up too.
Thanks for taking a look!
--
Jens Axboe