public inbox for [email protected]
* [PATCHSET 0/6] Switch kbuf mappings to vm_insert_pages()
@ 2024-03-21 14:44 Jens Axboe
  2024-03-21 14:44 ` [PATCH 1/6] io_uring/kbuf: get rid of lower BGID lists Jens Axboe
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Jens Axboe @ 2024-03-21 14:44 UTC (permalink / raw)
  To: io-uring

Hi,

This series cleans up kbuf management a bit.

The first two patches get rid of our array of buffer_lists, as in my
testing there's no discernible difference between the xarray lookup and
the array. This also gets rid of any difference between lower and higher
buffer group IDs, which is nice.

Patch 3 starts using vmap for the non-mmap case of provided buffer
rings, which means we can clean up the buffer indexing in
io_ring_buffer_select() as well, since there's now no difference between
how we handle the mmap and gup versions of buffer lists.

Patches 4 and 5 are prep patches for patch 6, which switches the mmap
buffer_list variant away from remap_pfn_range() and uses
vm_insert_pages() instead.

This is how it should've been done initially, and as per the diffstat,
it's a nice reduction in code as well.
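
For reference, the gist of the mmap-side change is sketched below. This is a
simplified illustration only, not the io_uring code itself (example_ctx and
example_mmap are made-up names); the real conversion is in patch 6.

#include <linux/fs.h>
#include <linux/mm.h>

/* Illustrative sketch: a file-backed object with preallocated backing pages */
struct example_ctx {
	struct page	**pages;
	unsigned long	nr_pages;
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct example_ctx *ctx = file->private_data;
	unsigned long npages = ctx->nr_pages;

	/*
	 * Old approach: remap_pfn_range() on the PFNs of a kernel side
	 * allocation, which leaves the lifetime of that memory entirely up
	 * to us and forces a deferred-free scheme.
	 *
	 * New approach: insert the backing pages directly. The mapping takes
	 * its own page references, so the ring side can drop its references
	 * whenever it is done with the memory.
	 */
	vm_flags_set(vma, VM_DONTEXPAND);
	return vm_insert_pages(vma, vma->vm_start, ctx->pages, &npages);
}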

 include/linux/io_uring_types.h |   4 -
 io_uring/io_uring.c            |  32 ++---
 io_uring/io_uring.h            |   3 -
 io_uring/kbuf.c                | 298 ++++++++++++++---------------------------
 io_uring/kbuf.h                |   8 +-
 mm/nommu.c                     |   7 +
 6 files changed, 119 insertions(+), 233 deletions(-)

-- 
Jens Axboe


* [PATCH 1/6] io_uring/kbuf: get rid of lower BGID lists
  2024-03-21 14:44 [PATCHSET 0/6] Switch kbuf mappings to vm_insert_pages() Jens Axboe
@ 2024-03-21 14:44 ` Jens Axboe
  2024-03-21 14:44 ` [PATCH 2/6] io_uring/kbuf: get rid of bl->is_ready Jens Axboe
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2024-03-21 14:44 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Just rely on the xarray for any kind of bgid. This simplifies things, and
the separate array for lower BGIDs really doesn't bring us much, if
anything.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h |  1 -
 io_uring/io_uring.c            |  2 -
 io_uring/kbuf.c                | 70 ++++------------------------------
 3 files changed, 8 insertions(+), 65 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index ea7e5488b3be..c9a1952a383a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -295,7 +295,6 @@ struct io_ring_ctx {
 
 		struct io_submit_state	submit_state;
 
-		struct io_buffer_list	*io_bl;
 		struct xarray		io_bl_xa;
 
 		struct io_hash_table	cancel_table_locked;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index de7f88df939c..5b80849fbb85 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -342,7 +342,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 err:
 	kfree(ctx->cancel_table.hbs);
 	kfree(ctx->cancel_table_locked.hbs);
-	kfree(ctx->io_bl);
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 	return NULL;
@@ -2849,7 +2848,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_napi_free(ctx);
 	kfree(ctx->cancel_table.hbs);
 	kfree(ctx->cancel_table_locked.hbs);
-	kfree(ctx->io_bl);
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 }
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 693c26da4ee1..8bf0121f00af 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -17,8 +17,6 @@
 
 #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
 
-#define BGID_ARRAY	64
-
 /* BIDs are addressed by a 16-bit field in a CQE */
 #define MAX_BIDS_PER_BGID (1 << 16)
 
@@ -40,13 +38,9 @@ struct io_buf_free {
 	int				inuse;
 };
 
-static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
-						   struct io_buffer_list *bl,
-						   unsigned int bgid)
+static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+							  unsigned int bgid)
 {
-	if (bl && bgid < BGID_ARRAY)
-		return &bl[bgid];
-
 	return xa_load(&ctx->io_bl_xa, bgid);
 }
 
@@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 {
 	lockdep_assert_held(&ctx->uring_lock);
 
-	return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
+	return __io_buffer_get_list(ctx, bgid);
 }
 
 static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -68,10 +62,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
 	 */
 	bl->bgid = bgid;
 	smp_store_release(&bl->is_ready, 1);
-
-	if (bgid < BGID_ARRAY)
-		return 0;
-
 	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
 }
 
@@ -208,24 +198,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 	return ret;
 }
 
-static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
-{
-	struct io_buffer_list *bl;
-	int i;
-
-	bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
-	if (!bl)
-		return -ENOMEM;
-
-	for (i = 0; i < BGID_ARRAY; i++) {
-		INIT_LIST_HEAD(&bl[i].buf_list);
-		bl[i].bgid = i;
-	}
-
-	smp_store_release(&ctx->io_bl, bl);
-	return 0;
-}
-
 /*
  * Mark the given mapped range as free for reuse
  */
@@ -300,13 +272,6 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 	struct list_head *item, *tmp;
 	struct io_buffer *buf;
 	unsigned long index;
-	int i;
-
-	for (i = 0; i < BGID_ARRAY; i++) {
-		if (!ctx->io_bl)
-			break;
-		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
-	}
 
 	xa_for_each(&ctx->io_bl_xa, index, bl) {
 		xa_erase(&ctx->io_bl_xa, bl->bgid);
@@ -489,12 +454,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 
 	io_ring_submit_lock(ctx, issue_flags);
 
-	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
-		ret = io_init_bl_list(ctx);
-		if (ret)
-			goto err;
-	}
-
 	bl = io_buffer_get_list(ctx, p->bgid);
 	if (unlikely(!bl)) {
 		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
@@ -507,14 +466,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 		if (ret) {
 			/*
 			 * Doesn't need rcu free as it was never visible, but
-			 * let's keep it consistent throughout. Also can't
-			 * be a lower indexed array group, as adding one
-			 * where lookup failed cannot happen.
+			 * let's keep it consistent throughout.
 			 */
-			if (p->bgid >= BGID_ARRAY)
-				kfree_rcu(bl, rcu);
-			else
-				WARN_ON_ONCE(1);
+			kfree_rcu(bl, rcu);
 			goto err;
 		}
 	}
@@ -679,12 +633,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (reg.ring_entries >= 65536)
 		return -EINVAL;
 
-	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
-		int ret = io_init_bl_list(ctx);
-		if (ret)
-			return ret;
-	}
-
 	bl = io_buffer_get_list(ctx, reg.bgid);
 	if (bl) {
 		/* if mapped buffer ring OR classic exists, don't allow */
@@ -734,10 +682,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 		return -EINVAL;
 
 	__io_remove_buffers(ctx, bl, -1U);
-	if (bl->bgid >= BGID_ARRAY) {
-		xa_erase(&ctx->io_bl_xa, bl->bgid);
-		kfree_rcu(bl, rcu);
-	}
+	xa_erase(&ctx->io_bl_xa, bl->bgid);
+	kfree_rcu(bl, rcu);
 	return 0;
 }
 
@@ -771,7 +717,7 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 {
 	struct io_buffer_list *bl;
 
-	bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
+	bl = __io_buffer_get_list(ctx, bgid);
 
 	if (!bl || !bl->is_mmap)
 		return NULL;
-- 
2.43.0


* [PATCH 2/6] io_uring/kbuf: get rid of bl->is_ready
  2024-03-21 14:44 [PATCHSET 0/6] Switch kbuf mappings to vm_insert_pages() Jens Axboe
  2024-03-21 14:44 ` [PATCH 1/6] io_uring/kbuf: get rid of lower BGID lists Jens Axboe
@ 2024-03-21 14:44 ` Jens Axboe
  2024-03-21 14:44 ` [PATCH 3/6] io_uring/kbuf: vmap pinned buffer ring Jens Axboe
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2024-03-21 14:44 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Now that the xarray is used exclusively for buffer_list lookups, this
check is no longer needed. Get rid of it and the is_ready member.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/kbuf.c | 8 --------
 io_uring/kbuf.h | 2 --
 2 files changed, 10 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 8bf0121f00af..011280d873e7 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -61,7 +61,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
 	 * always under the ->uring_lock, but the RCU lookup from mmap does.
 	 */
 	bl->bgid = bgid;
-	smp_store_release(&bl->is_ready, 1);
 	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
 }
 
@@ -721,13 +720,6 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 
 	if (!bl || !bl->is_mmap)
 		return NULL;
-	/*
-	 * Ensure the list is fully setup. Only strictly needed for RCU lookup
-	 * via mmap, and in that case only for the array indexed groups. For
-	 * the xarray lookups, it's either visible and ready, or not at all.
-	 */
-	if (!smp_load_acquire(&bl->is_ready))
-		return NULL;
 
 	return bl->buf_ring;
 }
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 1c7b654ee726..fdbb10449513 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -29,8 +29,6 @@ struct io_buffer_list {
 	__u8 is_buf_ring;
 	/* ring mapped provided buffers, but mmap'ed by application */
 	__u8 is_mmap;
-	/* bl is visible from an RCU point of view for lookup */
-	__u8 is_ready;
 };
 
 struct io_buffer {
-- 
2.43.0


* [PATCH 3/6] io_uring/kbuf: vmap pinned buffer ring
  2024-03-21 14:44 [PATCHSET 0/6] Switch kbuf mappings to vm_insert_pages() Jens Axboe
  2024-03-21 14:44 ` [PATCH 1/6] io_uring/kbuf: get rid of lower BGID lists Jens Axboe
  2024-03-21 14:44 ` [PATCH 2/6] io_uring/kbuf: get rid of bl->is_ready Jens Axboe
@ 2024-03-21 14:44 ` Jens Axboe
  2024-03-21 14:44 ` [PATCH 4/6] io_uring/kbuf: protect io_buffer_list teardown with a reference Jens Axboe
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2024-03-21 14:44 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

This avoids needing to care about HIGHMEM, and it makes the buffer
indexing easier, as both ring provided buffer methods are now virtually
mapped in a contiguous fashion.
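
As a rough illustration of the indexing point (this is just what the
io_ring_buffer_select() hunk below boils down to):

	/* bl->buf_ring comes from vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL),
	 * so the ring is one contiguous virtual range and any head value can
	 * be dereferenced directly, with no per-page lookup needed:
	 */
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf = &br->bufs[head & bl->mask];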

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/kbuf.c | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 011280d873e7..72c15dde34d3 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -7,6 +7,7 @@
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/poll.h>
+#include <linux/vmalloc.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
@@ -145,15 +146,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 		req->flags |= REQ_F_BL_EMPTY;
 
 	head &= bl->mask;
-	/* mmaped buffers are always contig */
-	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
-		buf = &br->bufs[head];
-	} else {
-		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
-		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
-		buf = page_address(bl->buf_pages[index]);
-		buf += off;
-	}
+	buf = &br->bufs[head];
 	if (*len == 0 || *len > buf->len)
 		*len = buf->len;
 	req->flags |= REQ_F_BUFFER_RING;
@@ -240,6 +233,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 			for (j = 0; j < bl->buf_nr_pages; j++)
 				unpin_user_page(bl->buf_pages[j]);
 			kvfree(bl->buf_pages);
+			vunmap(bl->buf_ring);
 			bl->buf_pages = NULL;
 			bl->buf_nr_pages = 0;
 		}
@@ -490,9 +484,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 			    struct io_buffer_list *bl)
 {
-	struct io_uring_buf_ring *br;
+	struct io_uring_buf_ring *br = NULL;
+	int nr_pages, ret, i;
 	struct page **pages;
-	int i, nr_pages;
 
 	pages = io_pin_pages(reg->ring_addr,
 			     flex_array_size(br, bufs, reg->ring_entries),
@@ -500,18 +494,12 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	/*
-	 * Apparently some 32-bit boxes (ARM) will return highmem pages,
-	 * which then need to be mapped. We could support that, but it'd
-	 * complicate the code and slowdown the common cases quite a bit.
-	 * So just error out, returning -EINVAL just like we did on kernels
-	 * that didn't support mapped buffer rings.
-	 */
-	for (i = 0; i < nr_pages; i++)
-		if (PageHighMem(pages[i]))
-			goto error_unpin;
+	br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!br) {
+		ret = -ENOMEM;
+		goto error_unpin;
+	}
 
-	br = page_address(pages[0]);
 #ifdef SHM_COLOUR
 	/*
 	 * On platforms that have specific aliasing requirements, SHM_COLOUR
@@ -522,8 +510,10 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
 	 * this transparently.
 	 */
-	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
+	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
+		ret = -EINVAL;
 		goto error_unpin;
+	}
 #endif
 	bl->buf_pages = pages;
 	bl->buf_nr_pages = nr_pages;
@@ -535,7 +525,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	for (i = 0; i < nr_pages; i++)
 		unpin_user_page(pages[i]);
 	kvfree(pages);
-	return -EINVAL;
+	vunmap(br);
+	return ret;
 }
 
 /*
-- 
2.43.0


* [PATCH 4/6] io_uring/kbuf: protect io_buffer_list teardown with a reference
  2024-03-21 14:44 [PATCHSET 0/6] Switch kbuf mappings to vm_insert_pages() Jens Axboe
                   ` (2 preceding siblings ...)
  2024-03-21 14:44 ` [PATCH 3/6] io_uring/kbuf: vmap pinned buffer ring Jens Axboe
@ 2024-03-21 14:44 ` Jens Axboe
  2024-03-21 14:45 ` [PATCH 5/6] mm: add nommu variant of vm_insert_pages() Jens Axboe
  2024-03-21 14:45 ` [PATCH 6/6] io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring Jens Axboe
  5 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2024-03-21 14:44 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

No functional changes in this patch, just in preparation for being able
to keep the buffer list alive outside of the ctx->uring_lock.
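
To sketch what the reference buys us (the caller below is hypothetical, the
real one shows up in the mmap patch later in this series): a user that cannot
hold ->uring_lock can pin the list for the duration of its access, and
whichever io_put_bl() ends up being last does the actual teardown.

	/* hypothetical user running without ->uring_lock held */
	if (atomic_inc_not_zero(&bl->refs)) {
		/* bl and its buffers cannot be freed while we hold the ref */
		use_buffer_list(bl);		/* placeholder for real work */
		io_put_bl(ctx, bl);		/* the final ref does the free */
	}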

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/kbuf.c | 15 +++++++++++----
 io_uring/kbuf.h |  2 ++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 72c15dde34d3..206f4d352e15 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -62,6 +62,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
 	 * always under the ->uring_lock, but the RCU lookup from mmap does.
 	 */
 	bl->bgid = bgid;
+	atomic_set(&bl->refs, 1);
 	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
 }
 
@@ -259,6 +260,14 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	return i;
 }
 
+static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+	if (atomic_dec_and_test(&bl->refs)) {
+		__io_remove_buffers(ctx, bl, -1U);
+		kfree_rcu(bl, rcu);
+	}
+}
+
 void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
 	struct io_buffer_list *bl;
@@ -268,8 +277,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 
 	xa_for_each(&ctx->io_bl_xa, index, bl) {
 		xa_erase(&ctx->io_bl_xa, bl->bgid);
-		__io_remove_buffers(ctx, bl, -1U);
-		kfree_rcu(bl, rcu);
+		io_put_bl(ctx, bl);
 	}
 
 	/*
@@ -671,9 +679,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (!bl->is_buf_ring)
 		return -EINVAL;
 
-	__io_remove_buffers(ctx, bl, -1U);
 	xa_erase(&ctx->io_bl_xa, bl->bgid);
-	kfree_rcu(bl, rcu);
+	io_put_bl(ctx, bl);
 	return 0;
 }
 
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index fdbb10449513..8b868a1744e2 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -25,6 +25,8 @@ struct io_buffer_list {
 	__u16 head;
 	__u16 mask;
 
+	atomic_t refs;
+
 	/* ring mapped provided buffers */
 	__u8 is_buf_ring;
 	/* ring mapped provided buffers, but mmap'ed by application */
-- 
2.43.0


* [PATCH 5/6] mm: add nommu variant of vm_insert_pages()
  2024-03-21 14:44 [PATCHSET 0/6] Switch kbuf mappings to vm_insert_pages() Jens Axboe
                   ` (3 preceding siblings ...)
  2024-03-21 14:44 ` [PATCH 4/6] io_uring/kbuf: protect io_buffer_list teardown with a reference Jens Axboe
@ 2024-03-21 14:45 ` Jens Axboe
  2024-03-21 14:45 ` [PATCH 6/6] io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring Jens Axboe
  5 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2024-03-21 14:45 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

An identical nommu stub already exists for vm_insert_page(); add one for
vm_insert_pages() as well, to avoid needing to check for CONFIG_MMU in
code using it.

Signed-off-by: Jens Axboe <[email protected]>
---
 mm/nommu.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mm/nommu.c b/mm/nommu.c
index 5ec8f44e7ce9..a34a0e376611 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -355,6 +355,13 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_insert_page);
 
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+			struct page **pages, unsigned long *num)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL(vm_insert_pages);
+
 int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
 			unsigned long num)
 {
-- 
2.43.0


* [PATCH 6/6] io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring
  2024-03-21 14:44 [PATCHSET 0/6] Switch kbuf mappings to vm_insert_pages() Jens Axboe
                   ` (4 preceding siblings ...)
  2024-03-21 14:45 ` [PATCH 5/6] mm: add nommu variant of vm_insert_pages() Jens Axboe
@ 2024-03-21 14:45 ` Jens Axboe
  5 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2024-03-21 14:45 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.

This requires a bit of effort on the mmap lookup side, as the ctx
uring_lock isn't held there. That lock is what otherwise protects
buffer_lists from being torn down, and it's not safe to grab it from mmap
context either, as that would introduce an ABBA deadlock between the mmap
lock and the ctx uring_lock. Instead, look up the buffer_list under RCU,
as the list is already RCU freed. Use the existing reference count to
determine whether it's possible to safely grab a reference to it (i.e. if
it's not already zero), and drop that reference when done with the
mapping. If the mmap reference is the last one, the buffer_list and the
associated memory can go away, since the vma insertion holds references
to the inserted pages at that point.
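
Schematically, the lookup side is the usual RCU plus refcount pattern. A
minimal sketch follows; the helper name is hypothetical, and the real code
lives inline in io_pbuf_mmap() in this patch.

static struct io_buffer_list *pbuf_get_bl(struct io_ring_ctx *ctx,
					  unsigned int bgid)
{
	struct io_buffer_list *bl;

	rcu_read_lock();
	bl = xa_load(&ctx->io_bl_xa, bgid);
	/* bl may be getting torn down concurrently; only succeed if its
	 * refcount hasn't already hit zero.
	 */
	if (bl && !atomic_inc_not_zero(&bl->refs))
		bl = NULL;
	rcu_read_unlock();
	return bl;
}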

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h |   3 -
 io_uring/io_uring.c            |  30 ++----
 io_uring/io_uring.h            |   3 -
 io_uring/kbuf.c                | 178 +++++++++++++--------------------
 io_uring/kbuf.h                |   4 +-
 5 files changed, 82 insertions(+), 136 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index c9a1952a383a..f37caff64d05 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -370,9 +370,6 @@ struct io_ring_ctx {
 
 	struct list_head	io_buffers_cache;
 
-	/* deferred free list, protected by ->uring_lock */
-	struct hlist_head	io_buf_list;
-
 	/* Keep this last, we don't need it for the fast path */
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5b80849fbb85..8ce36c5a37c4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -303,7 +303,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	INIT_HLIST_HEAD(&ctx->io_buf_list);
 	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
 			    sizeof(struct io_rsrc_node));
 	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2615,7 +2614,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
-void io_mem_free(void *ptr)
+static void io_mem_free(void *ptr)
 {
 	if (!ptr)
 		return;
@@ -2728,7 +2727,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
 	ctx->sq_sqes = NULL;
 }
 
-void *io_mem_alloc(size_t size)
+static void *io_mem_alloc(size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
 	void *ret;
@@ -2838,7 +2837,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 	io_rings_free(ctx);
-	io_kbuf_mmap_list_free(ctx);
 
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
@@ -3307,11 +3305,9 @@ static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
 	struct io_ring_ctx *ctx = file->private_data;
-	loff_t offset = pgoff << PAGE_SHIFT;
-	struct page *page;
 	void *ptr;
 
-	switch (offset & IORING_OFF_MMAP_MASK) {
+	switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
 		/* Don't allow mmap if the ring was setup without it */
@@ -3325,23 +3321,13 @@ static void *io_uring_validate_mmap_request(struct file *file,
 			return ERR_PTR(-EINVAL);
 		ptr = ctx->sq_sqes;
 		break;
-	case IORING_OFF_PBUF_RING: {
-		unsigned int bgid;
-
-		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-		rcu_read_lock();
-		ptr = io_pbuf_get_address(ctx, bgid);
-		rcu_read_unlock();
-		if (!ptr)
-			return ERR_PTR(-EINVAL);
-		break;
-		}
+	case IORING_OFF_PBUF_RING:
+		return 0;
 	default:
 		return ERR_PTR(-EINVAL);
 	}
 
-	page = virt_to_head_page(ptr);
-	if (sz > page_size(page))
+	if (sz > page_size(virt_to_head_page(ptr)))
 		return ERR_PTR(-EINVAL);
 
 	return ptr;
@@ -3352,6 +3338,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	size_t sz = vma->vm_end - vma->vm_start;
+	long offset = vma->vm_pgoff << PAGE_SHIFT;
 	unsigned long pfn;
 	void *ptr;
 
@@ -3359,6 +3346,9 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
 
+	if ((offset & IORING_OFF_MMAP_MASK) == IORING_OFF_PBUF_RING)
+		return io_pbuf_mmap(file, vma);
+
 	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index bae8c1e937c1..050efc3e7973 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -108,9 +108,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
 
-void *io_mem_alloc(size_t size);
-void io_mem_free(void *ptr);
-
 enum {
 	IO_EVENTFD_OP_SIGNAL_BIT,
 	IO_EVENTFD_OP_FREE_BIT,
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 206f4d352e15..52210772da2f 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -32,25 +32,12 @@ struct io_provide_buf {
 	__u16				bid;
 };
 
-struct io_buf_free {
-	struct hlist_node		list;
-	void				*mem;
-	size_t				size;
-	int				inuse;
-};
-
-static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
-							  unsigned int bgid)
-{
-	return xa_load(&ctx->io_bl_xa, bgid);
-}
-
 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 							unsigned int bgid)
 {
 	lockdep_assert_held(&ctx->uring_lock);
 
-	return __io_buffer_get_list(ctx, bgid);
+	return xa_load(&ctx->io_bl_xa, bgid);
 }
 
 static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -191,24 +178,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 	return ret;
 }
 
-/*
- * Mark the given mapped range as free for reuse
- */
-static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
-{
-	struct io_buf_free *ibf;
-
-	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
-		if (bl->buf_ring == ibf->mem) {
-			ibf->inuse = 0;
-			return;
-		}
-	}
-
-	/* can't happen... */
-	WARN_ON_ONCE(1);
-}
-
 static int __io_remove_buffers(struct io_ring_ctx *ctx,
 			       struct io_buffer_list *bl, unsigned nbufs)
 {
@@ -220,23 +189,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 
 	if (bl->is_buf_ring) {
 		i = bl->buf_ring->tail - bl->head;
-		if (bl->is_mmap) {
-			/*
-			 * io_kbuf_list_free() will free the page(s) at
-			 * ->release() time.
-			 */
-			io_kbuf_mark_free(ctx, bl);
-			bl->buf_ring = NULL;
-			bl->is_mmap = 0;
-		} else if (bl->buf_nr_pages) {
+		if (bl->buf_nr_pages) {
 			int j;
 
-			for (j = 0; j < bl->buf_nr_pages; j++)
-				unpin_user_page(bl->buf_pages[j]);
+			for (j = 0; j < bl->buf_nr_pages; j++) {
+				if (bl->is_mmap)
+					put_page(bl->buf_pages[j]);
+				else
+					unpin_user_page(bl->buf_pages[j]);
+			}
 			kvfree(bl->buf_pages);
 			vunmap(bl->buf_ring);
 			bl->buf_pages = NULL;
 			bl->buf_nr_pages = 0;
+			bl->is_mmap = 0;
 		}
 		/* make sure it's seen as empty */
 		INIT_LIST_HEAD(&bl->buf_list);
@@ -537,63 +503,48 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	return ret;
 }
 
-/*
- * See if we have a suitable region that we can reuse, rather than allocate
- * both a new io_buf_free and mem region again. We leave it on the list as
- * even a reused entry will need freeing at ring release.
- */
-static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
-						    size_t ring_size)
+static int io_alloc_map_pages(struct io_buffer_list *bl, size_t ring_size)
 {
-	struct io_buf_free *ibf, *best = NULL;
-	size_t best_dist;
-
-	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
-		size_t dist;
-
-		if (ibf->inuse || ibf->size < ring_size)
-			continue;
-		dist = ibf->size - ring_size;
-		if (!best || dist < best_dist) {
-			best = ibf;
-			if (!dist)
-				break;
-			best_dist = dist;
-		}
+	int i, nr_pages;
+
+	nr_pages = (ring_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	bl->buf_pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!bl->buf_pages)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_pages; i++) {
+		bl->buf_pages[i] = alloc_page(GFP_KERNEL);
+		if (!bl->buf_pages[i])
+			goto out_free;
 	}
 
-	return best;
+	bl->buf_ring = vmap(bl->buf_pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (bl->buf_ring) {
+		bl->buf_nr_pages = nr_pages;
+		return 0;
+	}
+out_free:
+	while (i--)
+		put_page(bl->buf_pages[i]);
+	kvfree(bl->buf_pages);
+	bl->buf_pages = NULL;
+	bl->buf_nr_pages = 0;
+	return -ENOMEM;
 }
 
 static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
 			      struct io_uring_buf_reg *reg,
 			      struct io_buffer_list *bl)
 {
-	struct io_buf_free *ibf;
 	size_t ring_size;
-	void *ptr;
+	int ret;
 
 	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
 
-	/* Reuse existing entry, if we can */
-	ibf = io_lookup_buf_free_entry(ctx, ring_size);
-	if (!ibf) {
-		ptr = io_mem_alloc(ring_size);
-		if (IS_ERR(ptr))
-			return PTR_ERR(ptr);
-
-		/* Allocate and store deferred free entry */
-		ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
-		if (!ibf) {
-			io_mem_free(ptr);
-			return -ENOMEM;
-		}
-		ibf->mem = ptr;
-		ibf->size = ring_size;
-		hlist_add_head(&ibf->list, &ctx->io_buf_list);
-	}
-	ibf->inuse = 1;
-	bl->buf_ring = ibf->mem;
+	ret = io_alloc_map_pages(bl, ring_size);
+	if (ret)
+		return ret;
+
 	bl->is_buf_ring = 1;
 	bl->is_mmap = 1;
 	return 0;
@@ -710,30 +661,43 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
 	return 0;
 }
 
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct io_ring_ctx *ctx = file->private_data;
+	loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
 	struct io_buffer_list *bl;
+	unsigned long npages;
+	int bgid, ret;
 
-	bl = __io_buffer_get_list(ctx, bgid);
+	bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
 
-	if (!bl || !bl->is_mmap)
-		return NULL;
+	/*
+	 * We have to be a bit careful here - we're inside mmap and cannot
+	 * grab the uring_lock. This means the buffer_list could be
+	 * simultaneously going away, if someone is trying to be sneaky.
+	 * Look it up under rcu so we know it's not going away, and attempt
+	 * to grab a reference to it. If the ref is already zero, then fail
+	 * the mapping. If successful, we'll drop the reference at the end.
+	 * This may then safely free the buffer_list (and drop the pages) at
+	 * that point, as vm_insert_pages() would've already grabbed the
+	 * necessary vma references.
+	 */
+	ret = 0;
+	rcu_read_lock();
+	bl = xa_load(&ctx->io_bl_xa, bgid);
+	/* must be a mmap'able buffer ring and have pages */
+	if (bl && bl->is_mmap && bl->buf_nr_pages)
+		ret = atomic_inc_not_zero(&bl->refs);
+	rcu_read_unlock();
+
+	/* buffer list is invalid or being torn down, fail the mapping */
+	if (!ret)
+		return -EINVAL;
 
-	return bl->buf_ring;
-}
+	vm_flags_set(vma, VM_DONTEXPAND);
 
-/*
- * Called at or after ->release(), free the mmap'ed buffers that we used
- * for memory mapped provided buffer rings.
- */
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
-{
-	struct io_buf_free *ibf;
-	struct hlist_node *tmp;
-
-	hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
-		hlist_del(&ibf->list);
-		io_mem_free(ibf->mem);
-		kfree(ibf);
-	}
+	npages = bl->buf_nr_pages;
+	ret = vm_insert_pages(vma, vma->vm_start, bl->buf_pages, &npages);
+	io_put_bl(ctx, bl);
+	return ret;
 }
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 8b868a1744e2..0723a6ffe731 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -55,13 +55,11 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
 
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
-
 void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
 
 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 {
-- 
2.43.0


end of thread [~2024-03-21 14:48 UTC]

Thread overview: 7+ messages
2024-03-21 14:44 [PATCHSET 0/6] Switch kbuf mappings to vm_insert_pages() Jens Axboe
2024-03-21 14:44 ` [PATCH 1/6] io_uring/kbuf: get rid of lower BGID lists Jens Axboe
2024-03-21 14:44 ` [PATCH 2/6] io_uring/kbuf: get rid of bl->is_ready Jens Axboe
2024-03-21 14:44 ` [PATCH 3/6] io_uring/kbuf: vmap pinned buffer ring Jens Axboe
2024-03-21 14:44 ` [PATCH 4/6] io_uring/kbuf: protect io_buffer_list teardown with a reference Jens Axboe
2024-03-21 14:45 ` [PATCH 5/6] mm: add nommu variant of vm_insert_pages() Jens Axboe
2024-03-21 14:45 ` [PATCH 6/6] io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring Jens Axboe
