* [PATCH 01/11] mm: add nommu variant of vm_insert_pages()
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
An identical nommu variant exists for vm_insert_page(); add one for
vm_insert_pages() to avoid needing to check for CONFIG_MMU in code using
it.
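As an illustration (hypothetical caller, not part of this patch), this
lets common code call it unconditionally and simply handle the -EINVAL
the nommu stub returns, rather than wrapping the call in #ifdef
CONFIG_MMU:

static int map_ring_pages(struct vm_area_struct *vma, struct page **pages,
			  unsigned long nr)
{
	/* on nommu kernels the stub just returns -EINVAL */
	return vm_insert_pages(vma, vma->vm_start, pages, &nr);
}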
Acked-by: Johannes Weiner <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
mm/nommu.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/mm/nommu.c b/mm/nommu.c
index 5ec8f44e7ce9..a34a0e376611 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -355,6 +355,13 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_insert_pages);
+
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num)
{
--
2.43.0
* [PATCH 02/11] io_uring: get rid of remap_pfn_range() for mapping rings/sqes
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.
If possible, allocate a single compound page that covers the range that
is needed. If that works, then we can just use page_address() on that
page. If we fail to get a compound page, allocate single pages and use
vmap() to map them into the kernel virtual address space.
This just covers the rings/sqes; the other remaining user of
remap_pfn_range() for mmap will be converted separately. Once that is
done, we can kill the old alloc/free code.
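For reference, the userspace side is unchanged by this; a minimal sketch
of how the ring region gets mapped (standalone example, not part of the
patch, error handling trimmed):

#include <sys/mman.h>
#include <linux/io_uring.h>

/* ring_fd from io_uring_setup(); ring_sz from struct io_uring_params */
static void *map_sq_cq_ring(int ring_fd, size_t ring_sz)
{
	return mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
}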
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/io_uring.c | 136 +++++++++++++++++++++++++++++++++++++++++---
io_uring/io_uring.h | 2 +
2 files changed, 130 insertions(+), 8 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 104899522bc5..982545ca23f9 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2594,6 +2594,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
+static void io_pages_unmap(void *ptr, struct page ***pages,
+ unsigned short *npages)
+{
+ bool do_vunmap = false;
+
+ if (*npages) {
+ struct page **to_free = *pages;
+ int i;
+
+ /*
+ * Only did vmap for the non-compound multiple page case.
+ * For the compound page, we just need to put the head.
+ */
+ if (PageCompound(to_free[0]))
+ *npages = 1;
+ else if (*npages > 1)
+ do_vunmap = true;
+ for (i = 0; i < *npages; i++)
+ put_page(to_free[i]);
+ }
+ if (do_vunmap)
+ vunmap(ptr);
+ kvfree(*pages);
+ *pages = NULL;
+ *npages = 0;
+}
+
void io_mem_free(void *ptr)
{
if (!ptr)
@@ -2694,8 +2721,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
static void io_rings_free(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
- io_mem_free(ctx->rings);
- io_mem_free(ctx->sq_sqes);
+ io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
+ io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
} else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0;
@@ -2707,6 +2734,80 @@ static void io_rings_free(struct io_ring_ctx *ctx)
ctx->sq_sqes = NULL;
}
+static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
+ size_t size, gfp_t gfp)
+{
+ struct page *page;
+ int i, order;
+
+ order = get_order(size);
+ if (order > MAX_PAGE_ORDER)
+ return NULL;
+ else if (order)
+ gfp |= __GFP_COMP;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return NULL;
+
+ for (i = 0; i < nr_pages; i++)
+ pages[i] = page + i;
+
+ return page_address(page);
+}
+
+static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
+ gfp_t gfp)
+{
+ void *ret;
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(gfp);
+ if (!pages[i])
+ goto err;
+ }
+
+ ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (ret)
+ return ret;
+err:
+ while (i--)
+ put_page(pages[i]);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+ struct page **pages;
+ int nr_pages;
+ void *ret;
+
+ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
+ if (ret)
+ goto done;
+
+ ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
+ if (ret) {
+done:
+ *out_pages = pages;
+ *npages = nr_pages;
+ return ret;
+ }
+
+ kvfree(pages);
+ *out_pages = NULL;
+ *npages = 0;
+ return ERR_PTR(-ENOMEM);
+}
+
void *io_mem_alloc(size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
@@ -3294,14 +3395,12 @@ static void *io_uring_validate_mmap_request(struct file *file,
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
- ptr = ctx->rings;
- break;
+ return ctx->rings;
case IORING_OFF_SQES:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
- ptr = ctx->sq_sqes;
- break;
+ return ctx->sq_sqes;
case IORING_OFF_PBUF_RING: {
unsigned int bgid;
@@ -3324,11 +3423,22 @@ static void *io_uring_validate_mmap_request(struct file *file,
return ptr;
}
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+ struct page **pages, int npages)
+{
+ unsigned long nr_pages = npages;
+
+ vm_flags_set(vma, VM_DONTEXPAND);
+ return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+}
+
#ifdef CONFIG_MMU
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start;
+ long offset = vma->vm_pgoff << PAGE_SHIFT;
unsigned long pfn;
void *ptr;
@@ -3336,6 +3446,16 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
if (IS_ERR(ptr))
return PTR_ERR(ptr);
+ switch (offset & IORING_OFF_MMAP_MASK) {
+ case IORING_OFF_SQ_RING:
+ case IORING_OFF_CQ_RING:
+ return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
+ ctx->n_ring_pages);
+ case IORING_OFF_SQES:
+ return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+ ctx->n_sqe_pages);
+ }
+
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
@@ -3625,7 +3745,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- rings = io_mem_alloc(size);
+ rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
else
rings = io_rings_map(ctx, p->cq_off.user_addr, size);
@@ -3650,7 +3770,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
}
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- ptr = io_mem_alloc(size);
+ ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
else
ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index dbd9a2b870eb..75230d914007 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -70,6 +70,8 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+ struct page **pages, int npages);
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
--
2.43.0
* Re: [PATCH 02/11] io_uring: get rid of remap_pfn_range() for mapping rings/sqes
From: Gabriel Krisman Bertazi @ 2024-03-30 3:50 UTC
To: Jens Axboe; +Cc: io-uring, hannes
Jens Axboe <[email protected]> writes:
> Rather than use remap_pfn_range() for this and manually free later,
> switch to using vm_insert_pages() and have it Just Work.
>
> If possible, allocate a single compound page that covers the range that
> is needed. If that works, then we can just use page_address() on that
> page. If we fail to get a compound page, allocate single pages and use
> vmap() to map them into the kernel virtual address space.
>
> This just covers the rings/sqes; the other remaining user of
> remap_pfn_range() for mmap will be converted separately. Once that is
> done, we can kill the old alloc/free code.
>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> io_uring/io_uring.c | 136 +++++++++++++++++++++++++++++++++++++++++---
> io_uring/io_uring.h | 2 +
> 2 files changed, 130 insertions(+), 8 deletions(-)
>
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 104899522bc5..982545ca23f9 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2594,6 +2594,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
> }
>
> +static void io_pages_unmap(void *ptr, struct page ***pages,
> + unsigned short *npages)
> +{
> + bool do_vunmap = false;
> +
> + if (*npages) {
> + struct page **to_free = *pages;
> + int i;
> +
> + /*
> + * Only did vmap for the non-compound multiple page case.
> + * For the compound page, we just need to put the head.
> + */
> + if (PageCompound(to_free[0]))
> + *npages = 1;
> + else if (*npages > 1)
> + do_vunmap = true;
> + for (i = 0; i < *npages; i++)
> + put_page(to_free[i]);
> + }
Hi Jens,
Wouldn't it be simpler to handle the compound case separately as a
folio? Then you folio_put the compound page here and just handle the
non-contiguous case after.
--
Gabriel Krisman Bertazi
* Re: [PATCH 02/11] io_uring: get rid of remap_pfn_range() for mapping rings/sqes
From: Jens Axboe @ 2024-03-30 15:14 UTC
To: Gabriel Krisman Bertazi; +Cc: io-uring, hannes
On 3/29/24 9:50 PM, Gabriel Krisman Bertazi wrote:
> Jens Axboe <[email protected]> writes:
>
>> Rather than use remap_pfn_range() for this and manually free later,
>> switch to using vm_insert_pages() and have it Just Work.
>>
>> If possible, allocate a single compound page that covers the range that
>> is needed. If that works, then we can just use page_address() on that
>> page. If we fail to get a compound page, allocate single pages and use
>> vmap() to map them into the kernel virtual address space.
>>
>> This just covers the rings/sqes; the other remaining user of
>> remap_pfn_range() for mmap will be converted separately. Once that is
>> done, we can kill the old alloc/free code.
>>
>> Signed-off-by: Jens Axboe <[email protected]>
>> ---
>> io_uring/io_uring.c | 136 +++++++++++++++++++++++++++++++++++++++++---
>> io_uring/io_uring.h | 2 +
>> 2 files changed, 130 insertions(+), 8 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 104899522bc5..982545ca23f9 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2594,6 +2594,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>> return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
>> }
>>
>> +static void io_pages_unmap(void *ptr, struct page ***pages,
>> + unsigned short *npages)
>> +{
>> + bool do_vunmap = false;
>> +
>> + if (*npages) {
>> + struct page **to_free = *pages;
>> + int i;
>> +
>> + /*
>> + * Only did vmap for the non-compound multiple page case.
>> + * For the compound page, we just need to put the head.
>> + */
>> + if (PageCompound(to_free[0]))
>> + *npages = 1;
>> + else if (*npages > 1)
>> + do_vunmap = true;
>> + for (i = 0; i < *npages; i++)
>> + put_page(to_free[i]);
>> + }
>
> Hi Jens,
>
> Wouldn't it be simpler to handle the compound case separately as a
> folio? Then you folio_put the compound page here and just handle the
> non-contiguous case after.
I don't think it makes sense, as we're still dealing with pages for
insertion. Once there's some folio variant of inserting pages, then yeah
I think it'd make sense to unify it. If not, we're doing the page <->
folio transition in one spot anyway.
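For the record, a rough sketch of what a folio-first put side could look
like once such an insertion variant exists (untested, shape only):

static void io_pages_unmap_folio(void *ptr, struct page ***pages,
				 unsigned short *npages)
{
	if (*npages) {
		struct folio *folio = page_folio((*pages)[0]);

		if (folio_test_large(folio)) {
			/* compound allocation: one put on the folio */
			folio_put(folio);
		} else {
			int i;

			/* discontig pages were vmap'ed, unwind both */
			for (i = 0; i < *npages; i++)
				put_page((*pages)[i]);
			if (*npages > 1)
				vunmap(ptr);
		}
	}
	kvfree(*pages);
	*pages = NULL;
	*npages = 0;
}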
--
Jens Axboe
* [PATCH 03/11] io_uring: use vmap() for ring mapping
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
This is the last holdout that does odd page checking; convert it to
vmap() just like what is done for the non-mmap path.
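The property relied on here is that vmap() returns one contiguous kernel
virtual range for an arbitrary set of pages, highmem included, which is
what makes the old PageHighMem() and contiguity checks unnecessary. The
idiom, as a sketch (hypothetical helper names):

/* map already-pinned user pages contiguously into kernel VA */
static void *ring_mem_map(struct page **pages, unsigned int nr_pages)
{
	return vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
}

/* vunmap() only tears down the mapping; the page references are
 * dropped separately (e.g. via unpin_user_pages()) */
static void ring_mem_unmap(void *addr)
{
	vunmap(addr);
}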
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/io_uring.c | 38 +++++++++-----------------------------
1 file changed, 9 insertions(+), 29 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 982545ca23f9..4c6eeb299e5d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -63,7 +63,6 @@
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
-#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
@@ -2649,7 +2648,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
struct page **page_array;
unsigned int nr_pages;
void *page_addr;
- int ret, i, pinned;
+ int ret, pinned;
*npages = 0;
@@ -2671,34 +2670,13 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
goto free_pages;
}
- page_addr = page_address(page_array[0]);
- for (i = 0; i < nr_pages; i++) {
- ret = -EINVAL;
-
- /*
- * Can't support mapping user allocated ring memory on 32-bit
- * archs where it could potentially reside in highmem. Just
- * fail those with -EINVAL, just like we did on kernels that
- * didn't support this feature.
- */
- if (PageHighMem(page_array[i]))
- goto free_pages;
-
- /*
- * No support for discontig pages for now, should either be a
- * single normal page, or a huge page. Later on we can add
- * support for remapping discontig pages, for now we will
- * just fail them with EINVAL.
- */
- if (page_address(page_array[i]) != page_addr)
- goto free_pages;
- page_addr += PAGE_SIZE;
+ page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (page_addr) {
+ *pages = page_array;
+ *npages = nr_pages;
+ return page_addr;
}
-
- *pages = page_array;
- *npages = nr_pages;
- return page_to_virt(page_array[0]);
-
+ ret = -ENOMEM;
free_pages:
io_pages_free(&page_array, pinned > 0 ? pinned : 0);
return ERR_PTR(ret);
@@ -2728,6 +2706,8 @@ static void io_rings_free(struct io_ring_ctx *ctx)
ctx->n_ring_pages = 0;
io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
ctx->n_sqe_pages = 0;
+ vunmap(ctx->rings);
+ vunmap(ctx->sq_sqes);
}
ctx->rings = NULL;
--
2.43.0
* [PATCH 04/11] io_uring: unify io_pin_pages()
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
Move it into io_uring.c where it belongs, and use it in there as well
rather than have two implementations of this.
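A worked example of the page-count math in io_pin_pages(), assuming 4K
pages:

/*
 * uaddr = 0x1010, len = 0x2000:
 *   end      = (0x1010 + 0x2000 + 0xfff) >> 12 = 0x400f >> 12 = 4
 *   start    =  0x1010 >> 12                   = 1
 *   nr_pages = end - start                     = 3
 *
 * The byte range [0x1010, 0x3010) straddles pages 1, 2 and 3, so three
 * pages get pinned even though len is exactly two pages worth.
 */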
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/io_uring.c | 61 +++++++++++++++++++++++++++++++--------------
io_uring/rsrc.c | 36 --------------------------
2 files changed, 42 insertions(+), 55 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4c6eeb299e5d..3aac7fbee499 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2642,33 +2642,57 @@ static void io_pages_free(struct page ***pages, int npages)
*pages = NULL;
}
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+{
+ unsigned long start, end, nr_pages;
+ struct page **pages;
+ int ret;
+
+ end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = uaddr >> PAGE_SHIFT;
+ nr_pages = end - start;
+ if (WARN_ON_ONCE(!nr_pages))
+ return ERR_PTR(-EINVAL);
+
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ /* success, mapped all pages */
+ if (ret == nr_pages) {
+ *npages = nr_pages;
+ return pages;
+ }
+
+ /* partial map, or didn't map anything */
+ if (ret >= 0) {
+ /* if we did partial map, release any pages we did get */
+ if (ret)
+ unpin_user_pages(pages, ret);
+ ret = -EFAULT;
+ }
+ kvfree(pages);
+ return ERR_PTR(ret);
+}
+
static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
unsigned long uaddr, size_t size)
{
struct page **page_array;
unsigned int nr_pages;
void *page_addr;
- int ret, pinned;
*npages = 0;
if (uaddr & (PAGE_SIZE - 1) || !size)
return ERR_PTR(-EINVAL);
- nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (nr_pages > USHRT_MAX)
- return ERR_PTR(-EINVAL);
- page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!page_array)
- return ERR_PTR(-ENOMEM);
-
-
- pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- page_array);
- if (pinned != nr_pages) {
- ret = (pinned < 0) ? pinned : -EFAULT;
- goto free_pages;
- }
+ nr_pages = 0;
+ page_array = io_pin_pages(uaddr, size, &nr_pages);
+ if (IS_ERR(page_array))
+ return page_array;
page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
if (page_addr) {
@@ -2676,10 +2700,9 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
*npages = nr_pages;
return page_addr;
}
- ret = -ENOMEM;
-free_pages:
- io_pages_free(&page_array, pinned > 0 ? pinned : 0);
- return ERR_PTR(ret);
+
+ io_pages_free(&page_array, nr_pages);
+ return ERR_PTR(-ENOMEM);
}
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 7b8a056f98ed..8a34181c97ab 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -870,42 +870,6 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
-{
- unsigned long start, end, nr_pages;
- struct page **pages = NULL;
- int ret;
-
- end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ubuf >> PAGE_SHIFT;
- nr_pages = end - start;
- WARN_ON(!nr_pages);
-
- pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- mmap_read_lock(current->mm);
- ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
- mmap_read_unlock(current->mm);
-
- /* success, mapped all pages */
- if (ret == nr_pages) {
- *npages = nr_pages;
- return pages;
- }
-
- /* partial map, or didn't map anything */
- if (ret >= 0) {
- /* if we did partial map, release any pages we did get */
- if (ret)
- unpin_user_pages(pages, ret);
- ret = -EFAULT;
- }
- kvfree(pages);
- return ERR_PTR(ret);
-}
-
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu,
struct page **last_hpage)
--
2.43.0
* [PATCH 05/11] io_uring/kbuf: get rid of lower BGID lists
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
Just rely on the xarray for any kind of bgid. This simplifies things,
and the separate array for the lower bgids really doesn't bring us much,
if anything.
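With the array gone, every bgid follows the same xarray lifecycle; in
sketch form (helper names are illustrative, the real ones are in the
diff below):

static struct io_buffer_list *bl_lookup(struct io_ring_ctx *ctx,
					unsigned int bgid)
{
	/* any bgid, no special-casing of low IDs */
	return xa_load(&ctx->io_bl_xa, bgid);
}

static int bl_add(struct io_ring_ctx *ctx, unsigned int bgid,
		  struct io_buffer_list *bl)
{
	bl->bgid = bgid;
	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}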
Signed-off-by: Jens Axboe <[email protected]>
---
include/linux/io_uring_types.h | 1 -
io_uring/io_uring.c | 2 -
io_uring/kbuf.c | 70 ++++------------------------------
3 files changed, 8 insertions(+), 65 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index b191710bec4f..8c64c303dee8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -295,7 +295,6 @@ struct io_ring_ctx {
struct io_submit_state submit_state;
- struct io_buffer_list *io_bl;
struct xarray io_bl_xa;
struct io_hash_table cancel_table_locked;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3aac7fbee499..17db5b2aa4b5 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -353,7 +353,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
io_futex_cache_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
@@ -2928,7 +2927,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_napi_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
- kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 693c26da4ee1..8bf0121f00af 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -17,8 +17,6 @@
#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
-#define BGID_ARRAY 64
-
/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)
@@ -40,13 +38,9 @@ struct io_buf_free {
int inuse;
};
-static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl,
- unsigned int bgid)
+static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
{
- if (bl && bgid < BGID_ARRAY)
- return &bl[bgid];
-
return xa_load(&ctx->io_bl_xa, bgid);
}
@@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
{
lockdep_assert_held(&ctx->uring_lock);
- return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
+ return __io_buffer_get_list(ctx, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -68,10 +62,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
*/
bl->bgid = bgid;
smp_store_release(&bl->is_ready, 1);
-
- if (bgid < BGID_ARRAY)
- return 0;
-
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
@@ -208,24 +198,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
return ret;
}
-static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
-{
- struct io_buffer_list *bl;
- int i;
-
- bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
- if (!bl)
- return -ENOMEM;
-
- for (i = 0; i < BGID_ARRAY; i++) {
- INIT_LIST_HEAD(&bl[i].buf_list);
- bl[i].bgid = i;
- }
-
- smp_store_release(&ctx->io_bl, bl);
- return 0;
-}
-
/*
* Mark the given mapped range as free for reuse
*/
@@ -300,13 +272,6 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
struct list_head *item, *tmp;
struct io_buffer *buf;
unsigned long index;
- int i;
-
- for (i = 0; i < BGID_ARRAY; i++) {
- if (!ctx->io_bl)
- break;
- __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
- }
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
@@ -489,12 +454,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
io_ring_submit_lock(ctx, issue_flags);
- if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
- ret = io_init_bl_list(ctx);
- if (ret)
- goto err;
- }
-
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
@@ -507,14 +466,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (ret) {
/*
* Doesn't need rcu free as it was never visible, but
- * let's keep it consistent throughout. Also can't
- * be a lower indexed array group, as adding one
- * where lookup failed cannot happen.
+ * let's keep it consistent throughout.
*/
- if (p->bgid >= BGID_ARRAY)
- kfree_rcu(bl, rcu);
- else
- WARN_ON_ONCE(1);
+ kfree_rcu(bl, rcu);
goto err;
}
}
@@ -679,12 +633,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.ring_entries >= 65536)
return -EINVAL;
- if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
- int ret = io_init_bl_list(ctx);
- if (ret)
- return ret;
- }
-
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
@@ -734,10 +682,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL;
__io_remove_buffers(ctx, bl, -1U);
- if (bl->bgid >= BGID_ARRAY) {
- xa_erase(&ctx->io_bl_xa, bl->bgid);
- kfree_rcu(bl, rcu);
- }
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ kfree_rcu(bl, rcu);
return 0;
}
@@ -771,7 +717,7 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
struct io_buffer_list *bl;
- bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
+ bl = __io_buffer_get_list(ctx, bgid);
if (!bl || !bl->is_mmap)
return NULL;
--
2.43.0
* [PATCH 06/11] io_uring/kbuf: get rid of bl->is_ready
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
Now that the xarray is used exclusively for buffer_list lookups, this
check is no longer needed. Get rid of it and the is_ready member.
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/kbuf.c | 8 --------
io_uring/kbuf.h | 2 --
2 files changed, 10 deletions(-)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 8bf0121f00af..011280d873e7 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -61,7 +61,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
* always under the ->uring_lock, but the RCU lookup from mmap does.
*/
bl->bgid = bgid;
- smp_store_release(&bl->is_ready, 1);
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
@@ -721,13 +720,6 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
if (!bl || !bl->is_mmap)
return NULL;
- /*
- * Ensure the list is fully setup. Only strictly needed for RCU lookup
- * via mmap, and in that case only for the array indexed groups. For
- * the xarray lookups, it's either visible and ready, or not at all.
- */
- if (!smp_load_acquire(&bl->is_ready))
- return NULL;
return bl->buf_ring;
}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 1c7b654ee726..fdbb10449513 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -29,8 +29,6 @@ struct io_buffer_list {
__u8 is_buf_ring;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
- /* bl is visible from an RCU point of view for lookup */
- __u8 is_ready;
};
struct io_buffer {
--
2.43.0
* [PATCH 07/11] io_uring/kbuf: vmap pinned buffer ring
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
This avoids needing to care about HIGHMEM, and it makes the buffer
indexing easier as both ring-provided buffer methods are now virtually
mapped in a contiguous fashion.
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/kbuf.c | 39 +++++++++++++++------------------------
1 file changed, 15 insertions(+), 24 deletions(-)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 011280d873e7..72c15dde34d3 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
+#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@@ -145,15 +146,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->flags |= REQ_F_BL_EMPTY;
head &= bl->mask;
- /* mmaped buffers are always contig */
- if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
- buf = &br->bufs[head];
- } else {
- int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
- int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
- buf = page_address(bl->buf_pages[index]);
- buf += off;
- }
+ buf = &br->bufs[head];
if (*len == 0 || *len > buf->len)
*len = buf->len;
req->flags |= REQ_F_BUFFER_RING;
@@ -240,6 +233,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
kvfree(bl->buf_pages);
+ vunmap(bl->buf_ring);
bl->buf_pages = NULL;
bl->buf_nr_pages = 0;
}
@@ -490,9 +484,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
- struct io_uring_buf_ring *br;
+ struct io_uring_buf_ring *br = NULL;
+ int nr_pages, ret, i;
struct page **pages;
- int i, nr_pages;
pages = io_pin_pages(reg->ring_addr,
flex_array_size(br, bufs, reg->ring_entries),
@@ -500,18 +494,12 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
if (IS_ERR(pages))
return PTR_ERR(pages);
- /*
- * Apparently some 32-bit boxes (ARM) will return highmem pages,
- * which then need to be mapped. We could support that, but it'd
- * complicate the code and slowdown the common cases quite a bit.
- * So just error out, returning -EINVAL just like we did on kernels
- * that didn't support mapped buffer rings.
- */
- for (i = 0; i < nr_pages; i++)
- if (PageHighMem(pages[i]))
- goto error_unpin;
+ br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!br) {
+ ret = -ENOMEM;
+ goto error_unpin;
+ }
- br = page_address(pages[0]);
#ifdef SHM_COLOUR
/*
* On platforms that have specific aliasing requirements, SHM_COLOUR
@@ -522,8 +510,10 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
- if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
+ if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
+ ret = -EINVAL;
goto error_unpin;
+ }
#endif
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
@@ -535,7 +525,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
for (i = 0; i < nr_pages; i++)
unpin_user_page(pages[i]);
kvfree(pages);
- return -EINVAL;
+ vunmap(br);
+ return ret;
}
/*
--
2.43.0
* [PATCH 08/11] io_uring/kbuf: protect io_buffer_list teardown with a reference
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
No functional changes in this patch, just in preparation for being able
to keep the buffer list alive outside of the ctx->uring_lock.
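In isolation, the idiom being set up (helper names are illustrative; the
lockless taker only arrives in a later patch in this series):

static void bl_init(struct io_buffer_list *bl)
{
	/* creation, under ->uring_lock: the xarray owns this reference */
	atomic_set(&bl->refs, 1);
}

static bool bl_tryget(struct io_buffer_list *bl)
{
	/* lockless lookup: fails once teardown dropped the last ref */
	return atomic_inc_not_zero(&bl->refs);
}

static void bl_put(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	/* whoever drops the last reference does the actual teardown */
	if (atomic_dec_and_test(&bl->refs)) {
		__io_remove_buffers(ctx, bl, -1U);
		kfree_rcu(bl, rcu);
	}
}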
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/kbuf.c | 15 +++++++++++----
io_uring/kbuf.h | 2 ++
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 72c15dde34d3..206f4d352e15 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -62,6 +62,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
* always under the ->uring_lock, but the RCU lookup from mmap does.
*/
bl->bgid = bgid;
+ atomic_set(&bl->refs, 1);
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
@@ -259,6 +260,14 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
return i;
}
+static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+ if (atomic_dec_and_test(&bl->refs)) {
+ __io_remove_buffers(ctx, bl, -1U);
+ kfree_rcu(bl, rcu);
+ }
+}
+
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
struct io_buffer_list *bl;
@@ -268,8 +277,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
- __io_remove_buffers(ctx, bl, -1U);
- kfree_rcu(bl, rcu);
+ io_put_bl(ctx, bl);
}
/*
@@ -671,9 +679,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!bl->is_buf_ring)
return -EINVAL;
- __io_remove_buffers(ctx, bl, -1U);
xa_erase(&ctx->io_bl_xa, bl->bgid);
- kfree_rcu(bl, rcu);
+ io_put_bl(ctx, bl);
return 0;
}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index fdbb10449513..8b868a1744e2 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -25,6 +25,8 @@ struct io_buffer_list {
__u16 head;
__u16 mask;
+ atomic_t refs;
+
/* ring mapped provided buffers */
__u8 is_buf_ring;
/* ring mapped provided buffers, but mmap'ed by application */
--
2.43.0
* [PATCH 09/11] io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.
This requires a bit of effort on the mmap lookup side: the ctx
uring_lock, which otherwise protects buffer_lists from being torn down,
isn't held there, and grabbing it from mmap context would introduce an
ABBA deadlock between the mmap lock and the ctx uring_lock. Instead,
look up the buffer_list under RCU, as the list is already RCU freed.
Use the existing reference count to determine whether it's possible to
safely grab a reference to it (e.g. if it's not already zero),
and drop that reference when done with the mapping. If the mmap
reference is the last one, the buffer_list and the associated memory can
go away, since the vma insertion has references to the inserted pages at
that point.
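On the userspace side, this is the mapping being served; the buffer
group ID is encoded in the mmap offset (standalone sketch, error
handling trimmed):

#include <sys/mman.h>
#include <linux/io_uring.h>

/* map a buffer ring that was registered with IOU_PBUF_RING_MMAP */
static void *map_pbuf_ring(int ring_fd, unsigned int bgid, size_t ring_sz)
{
	off_t off = IORING_OFF_PBUF_RING |
		    ((off_t)bgid << IORING_OFF_PBUF_SHIFT);

	return mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, ring_fd, off);
}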
Signed-off-by: Jens Axboe <[email protected]>
---
include/linux/io_uring_types.h | 3 -
io_uring/io_uring.c | 69 +++++--------
io_uring/io_uring.h | 6 +-
io_uring/kbuf.c | 171 +++++++++++----------------------
io_uring/kbuf.h | 7 +-
5 files changed, 85 insertions(+), 171 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 8c64c303dee8..aeb4639785b5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -372,9 +372,6 @@ struct io_ring_ctx {
struct list_head io_buffers_cache;
- /* deferred free list, protected by ->uring_lock */
- struct hlist_head io_buf_list;
-
/* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 17db5b2aa4b5..83f63630365a 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -302,7 +302,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
- INIT_HLIST_HEAD(&ctx->io_buf_list);
ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
@@ -2592,12 +2591,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void io_pages_unmap(void *ptr, struct page ***pages,
- unsigned short *npages)
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+ bool put_pages)
{
bool do_vunmap = false;
- if (*npages) {
+ if (put_pages && *npages) {
struct page **to_free = *pages;
int i;
@@ -2619,14 +2618,6 @@ static void io_pages_unmap(void *ptr, struct page ***pages,
*npages = 0;
}
-void io_mem_free(void *ptr)
-{
- if (!ptr)
- return;
-
- folio_put(virt_to_folio(ptr));
-}
-
static void io_pages_free(struct page ***pages, int npages)
{
struct page **page_array = *pages;
@@ -2721,8 +2712,10 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
static void io_rings_free(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
- io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
- io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
+ io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
+ true);
+ io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
+ true);
} else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0;
@@ -2779,8 +2772,8 @@ static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
return ERR_PTR(-ENOMEM);
}
-static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
- size_t size)
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
struct page **pages;
@@ -2810,17 +2803,6 @@ static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
return ERR_PTR(-ENOMEM);
}
-void *io_mem_alloc(size_t size)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
- void *ret;
-
- ret = (void *) __get_free_pages(gfp, get_order(size));
- if (ret)
- return ret;
- return ERR_PTR(-ENOMEM);
-}
-
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset)
{
@@ -2917,7 +2899,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
io_rings_free(ctx);
- io_kbuf_mmap_list_free(ctx);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
@@ -3387,10 +3368,8 @@ static void *io_uring_validate_mmap_request(struct file *file,
{
struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
- struct page *page;
- void *ptr;
- switch (offset & IORING_OFF_MMAP_MASK) {
+ switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
/* Don't allow mmap if the ring was setup without it */
@@ -3403,25 +3382,21 @@ static void *io_uring_validate_mmap_request(struct file *file,
return ERR_PTR(-EINVAL);
return ctx->sq_sqes;
case IORING_OFF_PBUF_RING: {
+ struct io_buffer_list *bl;
unsigned int bgid;
+ void *ret;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- rcu_read_lock();
- ptr = io_pbuf_get_address(ctx, bgid);
- rcu_read_unlock();
- if (!ptr)
- return ERR_PTR(-EINVAL);
- break;
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return bl;
+ ret = bl->buf_ring;
+ io_put_bl(ctx, bl);
+ return ret;
}
- default:
- return ERR_PTR(-EINVAL);
}
- page = virt_to_head_page(ptr);
- if (sz > page_size(page))
- return ERR_PTR(-EINVAL);
-
- return ptr;
+ return ERR_PTR(-EINVAL);
}
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
@@ -3440,7 +3415,6 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start;
long offset = vma->vm_pgoff << PAGE_SHIFT;
- unsigned long pfn;
void *ptr;
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
@@ -3455,10 +3429,11 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
case IORING_OFF_SQES:
return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
ctx->n_sqe_pages);
+ case IORING_OFF_PBUF_RING:
+ return io_pbuf_mmap(file, vma);
}
- pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
- return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+ return -EINVAL;
}
static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 75230d914007..dec996a1c789 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -109,8 +109,10 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
-void *io_mem_alloc(size_t size);
-void io_mem_free(void *ptr);
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size);
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+ bool put_pages);
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 206f4d352e15..99b349930a1a 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -32,25 +32,12 @@ struct io_provide_buf {
__u16 bid;
};
-struct io_buf_free {
- struct hlist_node list;
- void *mem;
- size_t size;
- int inuse;
-};
-
-static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
- unsigned int bgid)
-{
- return xa_load(&ctx->io_bl_xa, bgid);
-}
-
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
lockdep_assert_held(&ctx->uring_lock);
- return __io_buffer_get_list(ctx, bgid);
+ return xa_load(&ctx->io_bl_xa, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -191,24 +178,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
return ret;
}
-/*
- * Mark the given mapped range as free for reuse
- */
-static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
-{
- struct io_buf_free *ibf;
-
- hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
- if (bl->buf_ring == ibf->mem) {
- ibf->inuse = 0;
- return;
- }
- }
-
- /* can't happen... */
- WARN_ON_ONCE(1);
-}
-
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
@@ -220,23 +189,18 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->is_buf_ring) {
i = bl->buf_ring->tail - bl->head;
- if (bl->is_mmap) {
- /*
- * io_kbuf_list_free() will free the page(s) at
- * ->release() time.
- */
- io_kbuf_mark_free(ctx, bl);
- bl->buf_ring = NULL;
- bl->is_mmap = 0;
- } else if (bl->buf_nr_pages) {
+ if (bl->buf_nr_pages) {
int j;
- for (j = 0; j < bl->buf_nr_pages; j++)
- unpin_user_page(bl->buf_pages[j]);
- kvfree(bl->buf_pages);
- vunmap(bl->buf_ring);
- bl->buf_pages = NULL;
- bl->buf_nr_pages = 0;
+ for (j = 0; j < bl->buf_nr_pages; j++) {
+ if (bl->is_mmap)
+ put_page(bl->buf_pages[j]);
+ else
+ unpin_user_page(bl->buf_pages[j]);
+ }
+ io_pages_unmap(bl->buf_ring, &bl->buf_pages,
+ &bl->buf_nr_pages, false);
+ bl->is_mmap = 0;
}
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
@@ -260,7 +224,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
return i;
}
-static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
if (atomic_dec_and_test(&bl->refs)) {
__io_remove_buffers(ctx, bl, -1U);
@@ -537,63 +501,18 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
return ret;
}
-/*
- * See if we have a suitable region that we can reuse, rather than allocate
- * both a new io_buf_free and mem region again. We leave it on the list as
- * even a reused entry will need freeing at ring release.
- */
-static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
- size_t ring_size)
-{
- struct io_buf_free *ibf, *best = NULL;
- size_t best_dist;
-
- hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
- size_t dist;
-
- if (ibf->inuse || ibf->size < ring_size)
- continue;
- dist = ibf->size - ring_size;
- if (!best || dist < best_dist) {
- best = ibf;
- if (!dist)
- break;
- best_dist = dist;
- }
- }
-
- return best;
-}
-
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
- struct io_buf_free *ibf;
size_t ring_size;
- void *ptr;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
- /* Reuse existing entry, if we can */
- ibf = io_lookup_buf_free_entry(ctx, ring_size);
- if (!ibf) {
- ptr = io_mem_alloc(ring_size);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- /* Allocate and store deferred free entry */
- ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
- if (!ibf) {
- io_mem_free(ptr);
- return -ENOMEM;
- }
- ibf->mem = ptr;
- ibf->size = ring_size;
- hlist_add_head(&ibf->list, &ctx->io_buf_list);
- }
- ibf->inuse = 1;
- bl->buf_ring = ibf->mem;
+ bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
+ if (!bl->buf_ring)
+ return -ENOMEM;
+
bl->is_buf_ring = 1;
bl->is_mmap = 1;
return 0;
@@ -710,30 +629,50 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+ unsigned long bgid)
{
struct io_buffer_list *bl;
+ int ret = 0;
- bl = __io_buffer_get_list(ctx, bgid);
-
- if (!bl || !bl->is_mmap)
- return NULL;
-
- return bl->buf_ring;
+ /*
+ * We have to be a bit careful here - we're inside mmap and cannot
+ * grab the uring_lock. This means the buffer_list could be
+ * simultaneously going away, if someone is trying to be sneaky.
+ * Look it up under rcu so we now it's not going away, and attempt
+ * to grab a reference to it. If the ref is already zero, then fail
+ * the mapping. If successful, we'll drop the reference at at the end.
+ * This may then safely free the buffer_list (and drop the pages) at
+ * that point, vm_insert_pages() would've already grabbed the
+ * necessary vma references.
+ */
+ rcu_read_lock();
+ bl = xa_load(&ctx->io_bl_xa, bgid);
+ /* must be a mmap'able buffer ring and have pages */
+ if (bl && bl->is_mmap && bl->buf_nr_pages)
+ ret = atomic_inc_not_zero(&bl->refs);
+ rcu_read_unlock();
+
+ if (!ret)
+ return ERR_PTR(-EINVAL);
+
+ return bl;
}
-/*
- * Called at or after ->release(), free the mmap'ed buffers that we used
- * for memory mapped provided buffer rings.
- */
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct io_buf_free *ibf;
- struct hlist_node *tmp;
+ struct io_ring_ctx *ctx = file->private_data;
+ loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
+ struct io_buffer_list *bl;
+ int bgid, ret;
- hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
- hlist_del(&ibf->list);
- io_mem_free(ibf->mem);
- kfree(ibf);
- }
+ bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return PTR_ERR(bl);
+
+ ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
+ io_put_bl(ctx, bl);
+ return ret;
}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 8b868a1744e2..53c141d9a8b2 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -55,13 +55,14 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
-
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
-void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
+void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
+struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
+ unsigned long bgid);
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{
--
2.43.0
* [PATCH 10/11] io_uring: use unpin_user_pages() where appropriate
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
There are a few cases of open-coded loops around unpin_user_page(); use
the generic helper instead.
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/io_uring.c | 4 +---
io_uring/kbuf.c | 5 ++---
2 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 83f63630365a..00b98e80f8ca 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2621,13 +2621,11 @@ void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
static void io_pages_free(struct page ***pages, int npages)
{
struct page **page_array = *pages;
- int i;
if (!page_array)
return;
- for (i = 0; i < npages; i++)
- unpin_user_page(page_array[i]);
+ unpin_user_pages(page_array, npages);
kvfree(page_array);
*pages = NULL;
}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 99b349930a1a..3ba576ccb1d9 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -457,8 +457,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
struct io_uring_buf_ring *br = NULL;
- int nr_pages, ret, i;
struct page **pages;
+ int nr_pages, ret;
pages = io_pin_pages(reg->ring_addr,
flex_array_size(br, bufs, reg->ring_entries),
@@ -494,8 +494,7 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
bl->is_mmap = 0;
return 0;
error_unpin:
- for (i = 0; i < nr_pages; i++)
- unpin_user_page(pages[i]);
+ unpin_user_pages(pages, nr_pages);
kvfree(pages);
vunmap(br);
return ret;
--
2.43.0
* [PATCH 11/11] io_uring: move mapping/allocation helpers to a separate file
From: Jens Axboe @ 2024-03-28 23:31 UTC
To: io-uring; +Cc: hannes, Jens Axboe
Move the related code from io_uring.c into memmap.c. No functional
changes in this patch, just cleaning it up a bit now that the full
transition is done.
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/Makefile | 3 +-
io_uring/io_uring.c | 324 +-----------------------------------------
io_uring/io_uring.h | 9 --
io_uring/kbuf.c | 1 +
io_uring/memmap.c | 333 ++++++++++++++++++++++++++++++++++++++++++++
io_uring/memmap.h | 25 ++++
io_uring/rsrc.c | 1 +
7 files changed, 364 insertions(+), 332 deletions(-)
create mode 100644 io_uring/memmap.c
create mode 100644 io_uring/memmap.h
diff --git a/io_uring/Makefile b/io_uring/Makefile
index bd7c692a6a7c..fc1b23c524e8 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
xattr.o nop.o fs.o splice.o sync.o \
msg_ring.o advise.o openclose.o \
epoll.o statx.o timeout.o fdinfo.o \
- cancel.o waitid.o register.o truncate.o
+ cancel.o waitid.o register.o \
+ truncate.o memmap.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 00b98e80f8ca..fddaefb9cbff 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -95,6 +95,7 @@
#include "futex.h"
#include "napi.h"
#include "uring_cmd.h"
+#include "memmap.h"
#include "timeout.h"
#include "poll.h"
@@ -2591,108 +2592,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
- bool put_pages)
-{
- bool do_vunmap = false;
-
- if (put_pages && *npages) {
- struct page **to_free = *pages;
- int i;
-
- /*
- * Only did vmap for the non-compound multiple page case.
- * For the compound page, we just need to put the head.
- */
- if (PageCompound(to_free[0]))
- *npages = 1;
- else if (*npages > 1)
- do_vunmap = true;
- for (i = 0; i < *npages; i++)
- put_page(to_free[i]);
- }
- if (do_vunmap)
- vunmap(ptr);
- kvfree(*pages);
- *pages = NULL;
- *npages = 0;
-}
-
-static void io_pages_free(struct page ***pages, int npages)
-{
- struct page **page_array = *pages;
-
- if (!page_array)
- return;
-
- unpin_user_pages(page_array, npages);
- kvfree(page_array);
- *pages = NULL;
-}
-
-struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
-{
- unsigned long start, end, nr_pages;
- struct page **pages;
- int ret;
-
- end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = uaddr >> PAGE_SHIFT;
- nr_pages = end - start;
- if (WARN_ON_ONCE(!nr_pages))
- return ERR_PTR(-EINVAL);
-
- pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- pages);
- /* success, mapped all pages */
- if (ret == nr_pages) {
- *npages = nr_pages;
- return pages;
- }
-
- /* partial map, or didn't map anything */
- if (ret >= 0) {
- /* if we did partial map, release any pages we did get */
- if (ret)
- unpin_user_pages(pages, ret);
- ret = -EFAULT;
- }
- kvfree(pages);
- return ERR_PTR(ret);
-}
-
-static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
- unsigned long uaddr, size_t size)
-{
- struct page **page_array;
- unsigned int nr_pages;
- void *page_addr;
-
- *npages = 0;
-
- if (uaddr & (PAGE_SIZE - 1) || !size)
- return ERR_PTR(-EINVAL);
-
- nr_pages = 0;
- page_array = io_pin_pages(uaddr, size, &nr_pages);
- if (IS_ERR(page_array))
- return page_array;
-
- page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
- if (page_addr) {
- *pages = page_array;
- *npages = nr_pages;
- return page_addr;
- }
-
- io_pages_free(&page_array, nr_pages);
- return ERR_PTR(-ENOMEM);
-}
-
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
size_t size)
{
@@ -2727,80 +2626,6 @@ static void io_rings_free(struct io_ring_ctx *ctx)
ctx->sq_sqes = NULL;
}
-static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
- size_t size, gfp_t gfp)
-{
- struct page *page;
- int i, order;
-
- order = get_order(size);
- if (order > MAX_PAGE_ORDER)
- return NULL;
- else if (order)
- gfp |= __GFP_COMP;
-
- page = alloc_pages(gfp, order);
- if (!page)
- return NULL;
-
- for (i = 0; i < nr_pages; i++)
- pages[i] = page + i;
-
- return page_address(page);
-}
-
-static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
- gfp_t gfp)
-{
- void *ret;
- int i;
-
- for (i = 0; i < nr_pages; i++) {
- pages[i] = alloc_page(gfp);
- if (!pages[i])
- goto err;
- }
-
- ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (ret)
- return ret;
-err:
- while (i--)
- put_page(pages[i]);
- return ERR_PTR(-ENOMEM);
-}
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
- size_t size)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
- struct page **pages;
- int nr_pages;
- void *ret;
-
- nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
- if (ret)
- goto done;
-
- ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
- if (ret) {
-done:
- *out_pages = pages;
- *npages = nr_pages;
- return ret;
- }
-
- kvfree(pages);
- *out_pages = NULL;
- *npages = 0;
- return ERR_PTR(-ENOMEM);
-}
-
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset)
{
@@ -3361,149 +3186,6 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}

-static void *io_uring_validate_mmap_request(struct file *file,
- loff_t pgoff, size_t sz)
-{
- struct io_ring_ctx *ctx = file->private_data;
- loff_t offset = pgoff << PAGE_SHIFT;
-
- switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
- case IORING_OFF_SQ_RING:
- case IORING_OFF_CQ_RING:
- /* Don't allow mmap if the ring was set up without it */
- if (ctx->flags & IORING_SETUP_NO_MMAP)
- return ERR_PTR(-EINVAL);
- return ctx->rings;
- case IORING_OFF_SQES:
- /* Don't allow mmap if the ring was set up without it */
- if (ctx->flags & IORING_SETUP_NO_MMAP)
- return ERR_PTR(-EINVAL);
- return ctx->sq_sqes;
- case IORING_OFF_PBUF_RING: {
- struct io_buffer_list *bl;
- unsigned int bgid;
- void *ret;
-
- bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- bl = io_pbuf_get_bl(ctx, bgid);
- if (IS_ERR(bl))
- return bl;
- ret = bl->buf_ring;
- io_put_bl(ctx, bl);
- return ret;
- }
- }
-
- return ERR_PTR(-EINVAL);
-}
-
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
- struct page **pages, int npages)
-{
- unsigned long nr_pages = npages;
-
- vm_flags_set(vma, VM_DONTEXPAND);
- return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
-}
-
-#ifdef CONFIG_MMU
-
-static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- struct io_ring_ctx *ctx = file->private_data;
- size_t sz = vma->vm_end - vma->vm_start;
- long offset = vma->vm_pgoff << PAGE_SHIFT;
- void *ptr;
-
- ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- switch (offset & IORING_OFF_MMAP_MASK) {
- case IORING_OFF_SQ_RING:
- case IORING_OFF_CQ_RING:
- return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
- ctx->n_ring_pages);
- case IORING_OFF_SQES:
- return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
- ctx->n_sqe_pages);
- case IORING_OFF_PBUF_RING:
- return io_pbuf_mmap(file, vma);
- }
-
- return -EINVAL;
-}
-
-static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- void *ptr;
-
- /*
- * Do not allow mapping to a user-provided address, to avoid breaking
- * the aliasing rules. Userspace is not able to guess the offset address
- * of a kernel kmalloc()ed memory area.
- */
- if (addr)
- return -EINVAL;
-
- ptr = io_uring_validate_mmap_request(filp, pgoff, len);
- if (IS_ERR(ptr))
- return -ENOMEM;
-
- /*
- * Some architectures have strong cache aliasing requirements.
- * For such architectures we need a coherent mapping which aliases
- * kernel memory *and* userspace memory. To achieve that:
- * - use a NULL file pointer to reference physical memory, and
- * - use the kernel virtual address of the shared io_uring context
- * (instead of the userspace-provided address, which has to be 0UL
- * anyway).
- * - use the same pgoff which the get_unmapped_area() uses to
- * calculate the page colouring.
- * For architectures without such aliasing requirements, the
- * architecture will return any suitable mapping because addr is 0.
- */
- filp = NULL;
- flags |= MAP_SHARED;
- pgoff = 0; /* has been translated to ptr above */
-#ifdef SHM_COLOUR
- addr = (uintptr_t) ptr;
- pgoff = addr >> PAGE_SHIFT;
-#else
- addr = 0UL;
-#endif
- return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
-}
-
-#else /* !CONFIG_MMU */
-
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
-}
-
-static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
-{
- return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
-}
-
-static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- void *ptr;
-
- ptr = io_uring_validate_mmap_request(file, pgoff, len);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- return (unsigned long) ptr;
-}
-
-#endif /* !CONFIG_MMU */
-
static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{
if (flags & IORING_ENTER_EXT_ARG) {
@@ -3686,11 +3368,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.mmap = io_uring_mmap,
+ .get_unmapped_area = io_uring_get_unmapped_area,
#ifndef CONFIG_MMU
- .get_unmapped_area = io_uring_nommu_get_unmapped_area,
.mmap_capabilities = io_uring_nommu_mmap_capabilities,
-#else
- .get_unmapped_area = io_uring_mmu_get_unmapped_area,
#endif
.poll = io_uring_poll,
#ifdef CONFIG_PROC_FS
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index dec996a1c789..1eb65324792a 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -69,10 +69,6 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);

-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
- struct page **pages, int npages);
-
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
@@ -109,11 +105,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);

-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
- size_t size);
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
- bool put_pages);
-
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
IO_EVENTFD_OP_FREE_BIT,
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 3ba576ccb1d9..96dd8d05c754 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -15,6 +15,7 @@
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
+#include "memmap.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
new file mode 100644
index 000000000000..acf5e8ca6b28
--- /dev/null
+++ b/io_uring/memmap.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/io_uring.h>
+#include <linux/io_uring_types.h>
+#include <asm/shmparam.h>
+
+#include "memmap.h"
+#include "kbuf.h"
+
+static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
+ size_t size, gfp_t gfp)
+{
+ struct page *page;
+ int i, order;
+
+ order = get_order(size);
+ if (order > MAX_PAGE_ORDER)
+ return NULL;
+ else if (order)
+ gfp |= __GFP_COMP;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return NULL;
+
+ for (i = 0; i < nr_pages; i++)
+ pages[i] = page + i;
+
+ return page_address(page);
+}
+
+static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
+ gfp_t gfp)
+{
+ void *ret;
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(gfp);
+ if (!pages[i])
+ goto err;
+ }
+
+ ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (ret)
+ return ret;
+err:
+ while (i--)
+ put_page(pages[i]);
+ return ERR_PTR(-ENOMEM);
+}
+
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+ struct page **pages;
+ int nr_pages;
+ void *ret;
+
+ nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
+ if (ret)
+ goto done;
+
+ ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
+ if (ret) {
+done:
+ *out_pages = pages;
+ *npages = nr_pages;
+ return ret;
+ }
+
+ kvfree(pages);
+ *out_pages = NULL;
+ *npages = 0;
+ return ERR_PTR(-ENOMEM);
+}
+
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+ bool put_pages)
+{
+ bool do_vunmap = false;
+
+ if (put_pages && *npages) {
+ struct page **to_free = *pages;
+ int i;
+
+ /*
+ * Only did vmap for the non-compound multiple page case.
+ * For the compound page, we just need to put the head.
+ */
+ if (PageCompound(to_free[0]))
+ *npages = 1;
+ else if (*npages > 1)
+ do_vunmap = true;
+ for (i = 0; i < *npages; i++)
+ put_page(to_free[i]);
+ }
+ if (do_vunmap)
+ vunmap(ptr);
+ kvfree(*pages);
+ *pages = NULL;
+ *npages = 0;
+}
+
+void io_pages_free(struct page ***pages, int npages)
+{
+ struct page **page_array = *pages;
+
+ if (!page_array)
+ return;
+
+ unpin_user_pages(page_array, npages);
+ kvfree(page_array);
+ *pages = NULL;
+}
+
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+{
+ unsigned long start, end, nr_pages;
+ struct page **pages;
+ int ret;
+
+ end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = uaddr >> PAGE_SHIFT;
+ nr_pages = end - start;
+ if (WARN_ON_ONCE(!nr_pages))
+ return ERR_PTR(-EINVAL);
+
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ /* success, mapped all pages */
+ if (ret == nr_pages) {
+ *npages = nr_pages;
+ return pages;
+ }
+
+ /* partial map, or didn't map anything */
+ if (ret >= 0) {
+ /* if we did partial map, release any pages we did get */
+ if (ret)
+ unpin_user_pages(pages, ret);
+ ret = -EFAULT;
+ }
+ kvfree(pages);
+ return ERR_PTR(ret);
+}
+
+void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+ unsigned long uaddr, size_t size)
+{
+ struct page **page_array;
+ unsigned int nr_pages;
+ void *page_addr;
+
+ *npages = 0;
+
+ if (uaddr & (PAGE_SIZE - 1) || !size)
+ return ERR_PTR(-EINVAL);
+
+ nr_pages = 0;
+ page_array = io_pin_pages(uaddr, size, &nr_pages);
+ if (IS_ERR(page_array))
+ return page_array;
+
+ page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (page_addr) {
+ *pages = page_array;
+ *npages = nr_pages;
+ return page_addr;
+ }
+
+ io_pages_free(&page_array, nr_pages);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
+ size_t sz)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+ loff_t offset = pgoff << PAGE_SHIFT;
+
+ switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
+ case IORING_OFF_SQ_RING:
+ case IORING_OFF_CQ_RING:
+ /* Don't allow mmap if the ring was set up without it */
+ if (ctx->flags & IORING_SETUP_NO_MMAP)
+ return ERR_PTR(-EINVAL);
+ return ctx->rings;
+ case IORING_OFF_SQES:
+ /* Don't allow mmap if the ring was set up without it */
+ if (ctx->flags & IORING_SETUP_NO_MMAP)
+ return ERR_PTR(-EINVAL);
+ return ctx->sq_sqes;
+ case IORING_OFF_PBUF_RING: {
+ struct io_buffer_list *bl;
+ unsigned int bgid;
+ void *ret;
+
+ bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+ bl = io_pbuf_get_bl(ctx, bgid);
+ if (IS_ERR(bl))
+ return bl;
+ ret = bl->buf_ring;
+ io_put_bl(ctx, bl);
+ return ret;
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+ struct page **pages, int npages)
+{
+ unsigned long nr_pages = npages;
+
+ vm_flags_set(vma, VM_DONTEXPAND);
+ return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+}
+
+#ifdef CONFIG_MMU
+
+__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+ size_t sz = vma->vm_end - vma->vm_start;
+ long offset = vma->vm_pgoff << PAGE_SHIFT;
+ void *ptr;
+
+ ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ switch (offset & IORING_OFF_MMAP_MASK) {
+ case IORING_OFF_SQ_RING:
+ case IORING_OFF_CQ_RING:
+ return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
+ ctx->n_ring_pages);
+ case IORING_OFF_SQES:
+ return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
+ ctx->n_sqe_pages);
+ case IORING_OFF_PBUF_RING:
+ return io_pbuf_mmap(file, vma);
+ }
+
+ return -EINVAL;
+}
+
+unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ void *ptr;
+
+ /*
+ * Do not allow mapping to a user-provided address, to avoid breaking
+ * the aliasing rules. Userspace is not able to guess the offset address
+ * of a kernel kmalloc()ed memory area.
+ */
+ if (addr)
+ return -EINVAL;
+
+ ptr = io_uring_validate_mmap_request(filp, pgoff, len);
+ if (IS_ERR(ptr))
+ return -ENOMEM;
+
+ /*
+ * Some architectures have strong cache aliasing requirements.
+ * For such architectures we need a coherent mapping which aliases
+ * kernel memory *and* userspace memory. To achieve that:
+ * - use a NULL file pointer to reference physical memory, and
+ * - use the kernel virtual address of the shared io_uring context
+ * (instead of the userspace-provided address, which has to be 0UL
+ * anyway).
+ * - use the same pgoff which the get_unmapped_area() uses to
+ * calculate the page colouring.
+ * For architectures without such aliasing requirements, the
+ * architecture will return any suitable mapping because addr is 0.
+ */
+ filp = NULL;
+ flags |= MAP_SHARED;
+ pgoff = 0; /* has been translated to ptr above */
+#ifdef SHM_COLOUR
+ addr = (uintptr_t) ptr;
+ pgoff = addr >> PAGE_SHIFT;
+#else
+ addr = 0UL;
+#endif
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+#else /* !CONFIG_MMU */
+
+int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
+}
+
+unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
+}
+
+unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ void *ptr;
+
+ ptr = io_uring_validate_mmap_request(file, pgoff, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ return (unsigned long) ptr;
+}
+
+#endif /* !CONFIG_MMU */
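
As a reference point for the offsets validated above: the switch in
io_uring_validate_mmap_request() services the long-standing mmap(2)
offset protocol from the io_uring UAPI. A minimal userspace sketch of
mapping the rings and the SQE array (error handling and the
IORING_FEAT_SINGLE_MMAP feature check are elided for brevity):

	#include <string.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/io_uring.h>

	int main(void)
	{
		struct io_uring_params p;
		size_t sq_sz, sqes_sz;
		void *sq_ring, *sqes;
		int fd;

		memset(&p, 0, sizeof(p));
		fd = syscall(__NR_io_uring_setup, 8, &p);

		/* SQ ring: head/tail/flags plus the sqe index array */
		sq_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
		sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
			       MAP_SHARED | MAP_POPULATE, fd,
			       IORING_OFF_SQ_RING);

		/* SQE array is a separate mapping at IORING_OFF_SQES */
		sqes_sz = p.sq_entries * sizeof(struct io_uring_sqe);
		sqes = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);

		return !(sq_ring != MAP_FAILED && sqes != MAP_FAILED);
	}

With this series, those two mmap() calls end up backed by
vm_insert_pages() on the pages io_pages_map() allocated, rather than by
remap_pfn_range().
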
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
new file mode 100644
index 000000000000..5cec5b7ac49a
--- /dev/null
+++ b/io_uring/memmap.h
@@ -0,0 +1,25 @@
+#ifndef IO_URING_MEMMAP_H
+#define IO_URING_MEMMAP_H
+
+struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+void io_pages_free(struct page ***pages, int npages);
+int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
+ struct page **pages, int npages);
+
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size);
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+ bool put_pages);
+
+void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+ unsigned long uaddr, size_t size);
+
+#ifndef CONFIG_MMU
+unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
+#endif
+unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags);
+int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
+
+#endif
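
One detail worth noting about io_pin_pages() as declared above: the
start/end arithmetic rounds outward, so an unaligned range pins every
page it touches. For example, with 4K pages, uaddr = 0x1003 and
len = 0x2000 cover bytes 0x1003..0x3002 and hence pin three pages. A
userspace restatement of the same arithmetic (the 4K PAGE_SHIFT is
assumed purely for illustration):

	#include <assert.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	static unsigned long nr_pinned_pages(unsigned long uaddr,
					     unsigned long len)
	{
		unsigned long start = uaddr >> PAGE_SHIFT;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;

		return end - start;
	}

	int main(void)
	{
		/* 0x1003..0x3002 touches pages 1, 2 and 3 */
		assert(nr_pinned_pages(0x1003, 0x2000) == 3);
		return 0;
	}
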
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 8a34181c97ab..65417c9553b1 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -16,6 +16,7 @@
#include "alloc_cache.h"
#include "openclose.h"
#include "rsrc.h"
+#include "memmap.h"

struct io_rsrc_update {
struct file *file;
--
2.43.0
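
For completeness, the IORING_OFF_PBUF_RING case handled above decodes a
buffer group ID that userspace encodes into the mmap offset. For a ring
registered with IOU_PBUF_RING_MMAP (kernel-allocated), the mapping side
looks roughly like the sketch below; the helper name is only for
illustration and error handling is elided:

	#include <sys/mman.h>
	#include <linux/io_uring.h>

	static struct io_uring_buf_ring *map_pbuf_ring(int fd, unsigned bgid,
						       unsigned entries)
	{
		size_t sz = entries * sizeof(struct io_uring_buf);
		off_t off = IORING_OFF_PBUF_RING |
			    ((off_t)bgid << IORING_OFF_PBUF_SHIFT);

		return mmap(NULL, sz, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, fd, off);
	}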