public inbox for io-uring@vger.kernel.org
 help / color / mirror / Atom feed
From: Yuhao Jiang <danisjiang@gmail.com>
To: Jens Axboe <axboe@kernel.dk>, Pavel Begunkov <asml.silence@gmail.com>
Cc: io-uring@vger.kernel.org, linux-kernel@vger.kernel.org,
	Yuhao Jiang <danisjiang@gmail.com>,
	stable@vger.kernel.org
Subject: [PATCH] io_uring/rsrc: fix RLIMIT_MEMLOCK bypass via compound page accounting
Date: Wed, 17 Dec 2025 20:59:47 -0600	[thread overview]
Message-ID: <20251218025947.36115-1-danisjiang@gmail.com> (raw)

When multiple registered buffers share the same compound page, only the
first buffer accounts for the memory via io_buffer_account_pin(). The
subsequent buffers skip accounting since headpage_already_acct() returns
true.

When the first buffer is unregistered, the accounting is decremented,
but the compound page remains pinned by the remaining buffers. This
creates a state where pinned memory is not properly accounted against
RLIMIT_MEMLOCK.

On systems with HugeTLB pages pre-allocated, an unprivileged user can
exploit this to pin memory beyond RLIMIT_MEMLOCK by cycling buffer
registrations. The bypass amount is proportional to the number of
available huge pages, potentially allowing gigabytes of memory to be
pinned while the kernel accounting shows near-zero.

Fix this by recalculating the number of pages to unaccount when a buffer
is unmapped, instead of using the imu->acct_pages value stored at
registration time. Regular pages are always unaccounted. A compound page
is unaccounted only if no other registered buffer references it, so its
accounting persists until the last buffer referencing it is released.

Reported-by: Yuhao Jiang <danisjiang@gmail.com>
Fixes: 57bebf807e2a ("io_uring/rsrc: optimise registered huge pages")
Cc: stable@vger.kernel.org
Signed-off-by: Yuhao Jiang <danisjiang@gmail.com>
---
 io_uring/rsrc.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 67 insertions(+), 2 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a63474b331bf..dcf2340af5a2 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -139,15 +139,80 @@ static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
 		kvfree(imu);
 }
 
+/*
+ * Return the page count to unaccount when unmapping @imu. A non-compound
+ * bvec page counts as one. A compound page counts once per head, as its full
+ * size in pages, and only if no other buffer in ctx->buf_table references it.
+ */
+static unsigned long io_buffer_calc_unaccount(struct io_ring_ctx *ctx,
+					      struct io_mapped_ubuf *imu)
+{
+	struct page *last_hpage = NULL;
+	unsigned long acct = 0;
+	unsigned int i;
+
+	for (i = 0; i < imu->nr_bvecs; i++) {
+		struct page *page = imu->bvec[i].bv_page;
+		struct page *hpage;
+		unsigned int j;
+
+		if (!PageCompound(page)) {
+			acct++;
+			continue;
+		}
+
+		hpage = compound_head(page);
+		if (hpage == last_hpage) /* same head as the last compound page seen */
+			continue;
+		last_hpage = hpage;
+
+		/* Skip a head that an earlier bvec of this buffer already covered */
+		for (j = 0; j < i; j++) {
+			if (PageCompound(imu->bvec[j].bv_page) &&
+			    compound_head(imu->bvec[j].bv_page) == hpage)
+				goto next_hpage;
+		}
+
+		/* Leave it accounted while another table entry still references it */
+		for (j = 0; j < ctx->buf_table.nr; j++) {
+			struct io_rsrc_node *node = ctx->buf_table.nodes[j];
+			struct io_mapped_ubuf *other;
+			unsigned int k;
+
+			if (!node) /* empty table slot */
+				continue;
+			other = node->buf;
+			if (other == imu) /* the buffer being unmapped */
+				continue;
+
+			for (k = 0; k < other->nr_bvecs; k++) {
+				struct page *op = other->bvec[k].bv_page;
+
+				if (!PageCompound(op))
+					continue;
+				if (compound_head(op) == hpage)
+					goto next_hpage;
+			}
+		}
+		acct += page_size(hpage) >> PAGE_SHIFT;
+next_hpage:
+		; /* a label must be followed by a statement */
+	}
+	return acct;
+}
+
 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
 {
+	unsigned long acct; /* pages to unaccount, computed at unmap time */
+
 	if (unlikely(refcount_read(&imu->refs) > 1)) {
 		if (!refcount_dec_and_test(&imu->refs))
 			return;
 	}
 
-	if (imu->acct_pages)
-		io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
+	acct = io_buffer_calc_unaccount(ctx, imu); /* replaces imu->acct_pages */
+	if (acct) /* NOTE(review): confirm for buffers registered unaccounted */
+		io_unaccount_mem(ctx->user, ctx->mm_account, acct);
 	imu->release(imu->priv);
 	io_free_imu(ctx, imu);
 }
-- 
2.34.1


                 reply	other threads:[~2025-12-18  3:00 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251218025947.36115-1-danisjiang@gmail.com \
    --to=danisjiang@gmail.com \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=io-uring@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox