public inbox for io-uring@vger.kernel.org
* [PATCH 0/8] zcrx huge pages support Vol 1
@ 2025-06-17 14:48 Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 1/8] io_uring/zcrx: return error from io_zcrx_map_area_* Pavel Begunkov
                   ` (8 more replies)
  0 siblings, 9 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

Deduplicate some umem vs dmabuf code by creating an sg_table for umem areas,
and add huge page coalescing on top. It improves iommu mapping and compacts
the page array, but leaves optimising the NIC page sizes to follow-ups.
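
As a rough standalone illustration of what the coalescing buys (a userspace C
sketch with made-up sizes, not the kernel code in this series): with uniformly
sized folios, the per-PAGE_SIZE page array and its sg_table shrink to one
entry per folio.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned folio_shift = 21;	/* assumption: area backed by 2MB folios */
	unsigned long nr_pages = 1024;	/* 4MB area pinned as PAGE_SIZE pages */
	unsigned long pages_per_folio = 1UL << (folio_shift - PAGE_SHIFT);
	unsigned long nr_folios = nr_pages / pages_per_folio;

	printf("page array: %lu entries -> %lu after coalescing\n",
	       nr_pages, nr_folios);
	printf("each sg entry covers %lu bytes instead of %lu\n",
	       1UL << folio_shift, PAGE_SIZE);
	return 0;
}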

Pavel Begunkov (8):
  io_uring/zcrx: return error from io_zcrx_map_area_*
  io_uring/zcrx: introduce io_populate_area_dma
  io_uring/zcrx: allocate sgtable for umem areas
  io_uring/zcrx: assert area type in io_zcrx_iov_page
  io_uring/zcrx: convert io_zcrx_iov_page to use folios
  io_uring/zcrx: add infra for large pages
  io_uring: export io_coalesce_buffer()
  io_uring/zcrx: try to coalesce area pages

 io_uring/rsrc.c |   2 +-
 io_uring/rsrc.h |   2 +
 io_uring/zcrx.c | 179 +++++++++++++++++++++++++-----------------------
 io_uring/zcrx.h |   2 +
 4 files changed, 98 insertions(+), 87 deletions(-)

-- 
2.49.0


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/8] io_uring/zcrx: return error from io_zcrx_map_area_*
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
@ 2025-06-17 14:48 ` Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 2/8] io_uring/zcrx: introduce io_populate_area_dma Pavel Begunkov
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

io_zcrx_map_area_*() helpers return the number of processed niovs, which
we use to unroll partially completed mappings for user memory areas. It's
inconvenient, and dmabuf doesn't care about it. Return an error code instead
and move the partial unmapping on failure into io_zcrx_map_area_umem().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/zcrx.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 797247a34cb7..7d361b661ccb 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -139,13 +139,13 @@ static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area
 			struct net_iov *niov = &area->nia.niovs[niov_idx];
 
 			if (net_mp_niov_set_dma_addr(niov, dma))
-				return 0;
+				return -EFAULT;
 			sg_len -= PAGE_SIZE;
 			dma += PAGE_SIZE;
 			niov_idx++;
 		}
 	}
-	return niov_idx;
+	return 0;
 }
 
 static int io_import_umem(struct io_zcrx_ifq *ifq,
@@ -254,29 +254,30 @@ static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *a
 			break;
 		}
 	}
-	return i;
+
+	if (i != area->nia.num_niovs) {
+		__io_zcrx_unmap_area(ifq, area, i);
+		return -EINVAL;
+	}
+	return 0;
 }
 
 static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
-	unsigned nr;
+	int ret;
 
 	guard(mutex)(&ifq->dma_lock);
 	if (area->is_mapped)
 		return 0;
 
 	if (area->mem.is_dmabuf)
-		nr = io_zcrx_map_area_dmabuf(ifq, area);
+		ret = io_zcrx_map_area_dmabuf(ifq, area);
 	else
-		nr = io_zcrx_map_area_umem(ifq, area);
+		ret = io_zcrx_map_area_umem(ifq, area);
 
-	if (nr != area->nia.num_niovs) {
-		__io_zcrx_unmap_area(ifq, area, nr);
-		return -EINVAL;
-	}
-
-	area->is_mapped = true;
-	return 0;
+	if (ret == 0)
+		area->is_mapped = true;
+	return ret;
 }
 
 static void io_zcrx_sync_for_device(const struct page_pool *pool,
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/8] io_uring/zcrx: introduce io_populate_area_dma
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 1/8] io_uring/zcrx: return error from io_zcrx_map_area_* Pavel Begunkov
@ 2025-06-17 14:48 ` Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 3/8] io_uring/zcrx: allocate sgtable for umem areas Pavel Begunkov
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

Add a helper that initialises page-pool dma addresses from an sg table.
It'll be reused in the following patches.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/zcrx.c | 56 +++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 7d361b661ccb..2b82fda6d934 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -47,6 +47,35 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
 	return area->mem.pages[net_iov_idx(niov)];
 }
 
+static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
+				struct io_zcrx_area *area,
+				struct sg_table *sgt, unsigned long off)
+{
+	struct scatterlist *sg;
+	unsigned i, niov_idx = 0;
+
+	for_each_sgtable_dma_sg(sgt, sg, i) {
+		dma_addr_t dma = sg_dma_address(sg);
+		unsigned long sg_len = sg_dma_len(sg);
+		unsigned long sg_off = min(sg_len, off);
+
+		off -= sg_off;
+		sg_len -= sg_off;
+		dma += sg_off;
+
+		while (sg_len && niov_idx < area->nia.num_niovs) {
+			struct net_iov *niov = &area->nia.niovs[niov_idx];
+
+			if (net_mp_niov_set_dma_addr(niov, dma))
+				return -EFAULT;
+			sg_len -= PAGE_SIZE;
+			dma += PAGE_SIZE;
+			niov_idx++;
+		}
+	}
+	return 0;
+}
+
 static void io_release_dmabuf(struct io_zcrx_mem *mem)
 {
 	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
@@ -119,33 +148,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
 
 static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
-	unsigned long off = area->mem.dmabuf_offset;
-	struct scatterlist *sg;
-	unsigned i, niov_idx = 0;
-
 	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
 		return -EINVAL;
-
-	for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
-		dma_addr_t dma = sg_dma_address(sg);
-		unsigned long sg_len = sg_dma_len(sg);
-		unsigned long sg_off = min(sg_len, off);
-
-		off -= sg_off;
-		sg_len -= sg_off;
-		dma += sg_off;
-
-		while (sg_len && niov_idx < area->nia.num_niovs) {
-			struct net_iov *niov = &area->nia.niovs[niov_idx];
-
-			if (net_mp_niov_set_dma_addr(niov, dma))
-				return -EFAULT;
-			sg_len -= PAGE_SIZE;
-			dma += PAGE_SIZE;
-			niov_idx++;
-		}
-	}
-	return 0;
+	return io_populate_area_dma(ifq, area, area->mem.sgt,
+				    area->mem.dmabuf_offset);
 }
 
 static int io_import_umem(struct io_zcrx_ifq *ifq,
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/8] io_uring/zcrx: allocate sgtable for umem areas
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 1/8] io_uring/zcrx: return error from io_zcrx_map_area_* Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 2/8] io_uring/zcrx: introduce io_populate_area_dma Pavel Begunkov
@ 2025-06-17 14:48 ` Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 4/8] io_uring/zcrx: assert area type in io_zcrx_iov_page Pavel Begunkov
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

Currently, dma addresses for umem areas are stored directly in niovs. It's
memory efficient but inconvenient. I need a better format 1) to share code
with dmabuf areas, and 2) to disentangle page, folio and niov sizes. dmabuf
already provides an sg_table, so create one for user memory as well.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/zcrx.c | 80 ++++++++++++++++++-------------------------------
 io_uring/zcrx.h |  1 +
 2 files changed, 30 insertions(+), 51 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 2b82fda6d934..91e795e6ae1a 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -158,8 +158,10 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
 			  struct io_zcrx_mem *mem,
 			  struct io_uring_zcrx_area_reg *area_reg)
 {
+	struct scatterlist *sgl;
 	struct page **pages;
-	int nr_pages;
+	int nr_pages, ret;
+	unsigned i = 0;
 
 	if (area_reg->dmabuf_fd)
 		return -EINVAL;
@@ -170,6 +172,12 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
+	ret = sg_alloc_table(&mem->page_sg_table, nr_pages, GFP_KERNEL_ACCOUNT);
+	if (ret)
+		return ret;
+	for_each_sg(mem->page_sg_table.sgl, sgl, nr_pages, i)
+		sg_set_page(sgl, pages[i], PAGE_SIZE, 0);
+
 	mem->pages = pages;
 	mem->nr_folios = nr_pages;
 	mem->size = area_reg->len;
@@ -184,6 +192,7 @@ static void io_release_area_mem(struct io_zcrx_mem *mem)
 	}
 	if (mem->pages) {
 		unpin_user_pages(mem->pages, mem->nr_folios);
+		sg_free_table(&mem->page_sg_table);
 		kvfree(mem->pages);
 	}
 }
@@ -205,67 +214,36 @@ static int io_import_area(struct io_zcrx_ifq *ifq,
 	return io_import_umem(ifq, mem, area_reg);
 }
 
-static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
-				struct io_zcrx_area *area, int nr_mapped)
-{
-	int i;
-
-	for (i = 0; i < nr_mapped; i++) {
-		netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
-		dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);
-
-		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
-				     DMA_FROM_DEVICE, IO_DMA_ATTR);
-	}
-}
-
-static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
-				 struct io_zcrx_area *area, int nr_mapped)
+static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
+				struct io_zcrx_area *area)
 {
 	int i;
 
-	if (area->mem.is_dmabuf)
-		io_release_dmabuf(&area->mem);
-	else
-		io_zcrx_unmap_umem(ifq, area, nr_mapped);
+	guard(mutex)(&ifq->dma_lock);
+	if (!area->is_mapped)
+		return;
+	area->is_mapped = false;
 
 	for (i = 0; i < area->nia.num_niovs; i++)
 		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
-}
-
-static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
-{
-	guard(mutex)(&ifq->dma_lock);
 
-	if (area->is_mapped)
-		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
-	area->is_mapped = false;
+	if (area->mem.is_dmabuf) {
+		io_release_dmabuf(&area->mem);
+	} else {
+		dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
+				  DMA_FROM_DEVICE, IO_DMA_ATTR);
+	}
 }
 
-static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
-	int i;
-
-	for (i = 0; i < area->nia.num_niovs; i++) {
-		struct net_iov *niov = &area->nia.niovs[i];
-		dma_addr_t dma;
-
-		dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
-					 PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
-		if (dma_mapping_error(ifq->dev, dma))
-			break;
-		if (net_mp_niov_set_dma_addr(niov, dma)) {
-			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
-					     DMA_FROM_DEVICE, IO_DMA_ATTR);
-			break;
-		}
-	}
+	int ret;
 
-	if (i != area->nia.num_niovs) {
-		__io_zcrx_unmap_area(ifq, area, i);
-		return -EINVAL;
-	}
-	return 0;
+	ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
+				DMA_FROM_DEVICE, IO_DMA_ATTR);
+	if (ret < 0)
+		return ret;
+	return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0);
 }
 
 static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 2f5e26389f22..89015b923911 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -14,6 +14,7 @@ struct io_zcrx_mem {
 
 	struct page			**pages;
 	unsigned long			nr_folios;
+	struct sg_table			page_sg_table;
 
 	struct dma_buf_attachment	*attach;
 	struct dma_buf			*dmabuf;
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 4/8] io_uring/zcrx: assert area type in io_zcrx_iov_page
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
                   ` (2 preceding siblings ...)
  2025-06-17 14:48 ` [PATCH 3/8] io_uring/zcrx: allocate sgtable for umem areas Pavel Begunkov
@ 2025-06-17 14:48 ` Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 5/8] io_uring/zcrx: convert io_zcrx_iov_page to use folios Pavel Begunkov
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

Add a simple debug assertion to io_zcrx_iov_page() making sure it's not
trying to return pages for a dmabuf area.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/zcrx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 91e795e6ae1a..a2493798e6f8 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -44,6 +44,8 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
 {
 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
 
+	lockdep_assert(!area->mem.is_dmabuf);
+
 	return area->mem.pages[net_iov_idx(niov)];
 }
 
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 5/8] io_uring/zcrx: convert io_zcrx_iov_page to use folios
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
                   ` (3 preceding siblings ...)
  2025-06-17 14:48 ` [PATCH 4/8] io_uring/zcrx: assert area type in io_zcrx_iov_page Pavel Begunkov
@ 2025-06-17 14:48 ` Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 6/8] io_uring/zcrx: add infra for large pages Pavel Begunkov
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

Modernise io_zcrx_iov_page() and make it return a folio plus an offset.
That will be used by the next patches implementing area page coalescing.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/zcrx.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index a2493798e6f8..a83b80c16491 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -40,13 +40,17 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio
 	return container_of(owner, struct io_zcrx_area, nia);
 }
 
-static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
+static inline struct folio *io_niov_folio(const struct net_iov *niov,
+					  unsigned long *off)
 {
 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+	struct page *page;
 
 	lockdep_assert(!area->mem.is_dmabuf);
 
-	return area->mem.pages[net_iov_idx(niov)];
+	page = area->mem.pages[net_iov_idx(niov)];
+	*off = (page - compound_head(page)) << PAGE_SHIFT;
+	return page_folio(page);
 }
 
 static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
@@ -944,7 +948,8 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
 		const int dst_off = 0;
 		struct net_iov *niov;
-		struct page *dst_page;
+		struct folio *dst_folio;
+		unsigned long dst_folio_off;
 		void *dst_addr;
 
 		niov = io_zcrx_alloc_fallback(area);
@@ -953,8 +958,8 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 			break;
 		}
 
-		dst_page = io_zcrx_iov_page(niov);
-		dst_addr = kmap_local_page(dst_page);
+		dst_folio = io_niov_folio(niov, &dst_folio_off);
+		dst_addr = kmap_local_folio(dst_folio, dst_folio_off);
 		if (src_page)
 			src_base = kmap_local_page(src_page);
 
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 6/8] io_uring/zcrx: add infra for large pages
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
                   ` (4 preceding siblings ...)
  2025-06-17 14:48 ` [PATCH 5/8] io_uring/zcrx: convert io_zcrx_iov_page to use folios Pavel Begunkov
@ 2025-06-17 14:48 ` Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 7/8] io_uring: export io_coalesce_buffer() Pavel Begunkov
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

Currently, zcrx assumes PAGE_SIZE pages; add infrastructure to support
uniformly sized higher-order pages. The sg_table hides most of the details,
and the only place that needs index recalculation is the copy fallback.
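
For reference, a minimal standalone sketch of that index recalculation
(userspace C, assuming PAGE_SHIFT of 12 and a uniform folio_shift; purely
illustrative, not the kernel helper itself):

#include <stdio.h>

#define PAGE_SHIFT	12

/* mirror the io_niov_folio() arithmetic: niov index -> folio index + byte offset */
static void niov_to_folio(unsigned long niov_idx, unsigned folio_shift,
			  unsigned long *folio_idx, unsigned long *off)
{
	unsigned long niov_off = niov_idx << PAGE_SHIFT;

	*folio_idx = niov_off >> folio_shift;
	*off = niov_off - (*folio_idx << folio_shift);
}

int main(void)
{
	unsigned long folio_idx, off;

	/* with 2MB folios, niov 513 lands in folio 1 at offset 4096 */
	niov_to_folio(513, 21, &folio_idx, &off);
	printf("folio %lu, offset %lu\n", folio_idx, off);
	return 0;
}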

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/zcrx.c | 13 ++++++++-----
 io_uring/zcrx.h |  1 +
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index a83b80c16491..44b5f8084279 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -44,13 +44,14 @@ static inline struct folio *io_niov_folio(const struct net_iov *niov,
 					  unsigned long *off)
 {
 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-	struct page *page;
+	unsigned long niov_off, folio_idx;
 
 	lockdep_assert(!area->mem.is_dmabuf);
 
-	page = area->mem.pages[net_iov_idx(niov)];
-	*off = (page - compound_head(page)) << PAGE_SHIFT;
-	return page_folio(page);
+	niov_off = net_iov_idx(niov) << PAGE_SHIFT;
+	folio_idx = niov_off >> area->mem.folio_shift;
+	*off = niov_off - (folio_idx << area->mem.folio_shift);
+	return page_folio(area->mem.pages[folio_idx]);
 }
 
 static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
@@ -164,6 +165,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
 			  struct io_zcrx_mem *mem,
 			  struct io_uring_zcrx_area_reg *area_reg)
 {
+	unsigned folio_shift = PAGE_SHIFT;
 	struct scatterlist *sgl;
 	struct page **pages;
 	int nr_pages, ret;
@@ -182,11 +184,12 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
 	if (ret)
 		return ret;
 	for_each_sg(mem->page_sg_table.sgl, sgl, nr_pages, i)
-		sg_set_page(sgl, pages[i], PAGE_SIZE, 0);
+		sg_set_page(sgl, pages[i], 1U << folio_shift, 0);
 
 	mem->pages = pages;
 	mem->nr_folios = nr_pages;
 	mem->size = area_reg->len;
+	mem->folio_shift = folio_shift;
 	return 0;
 }
 
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 89015b923911..4f718b3088d9 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -14,6 +14,7 @@ struct io_zcrx_mem {
 
 	struct page			**pages;
 	unsigned long			nr_folios;
+	unsigned			folio_shift;
 	struct sg_table			page_sg_table;
 
 	struct dma_buf_attachment	*attach;
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 7/8] io_uring: export io_coalesce_buffer()
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
                   ` (5 preceding siblings ...)
  2025-06-17 14:48 ` [PATCH 6/8] io_uring/zcrx: add infra for large pages Pavel Begunkov
@ 2025-06-17 14:48 ` Pavel Begunkov
  2025-06-17 14:48 ` [PATCH 8/8] io_uring/zcrx: try to coalesce area pages Pavel Begunkov
  2025-06-30 17:12 ` [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
  8 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

We'll need io_coalesce_buffer() in the next zcrx patch, so export it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/rsrc.c | 2 +-
 io_uring/rsrc.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index c592ceace97d..bab0ea45046d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -684,7 +684,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
-static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
+bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
 				struct io_imu_folio_data *data)
 {
 	struct page **page_array = *pages, **new_array = NULL;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 0d2138f16322..4dcedfa69b8c 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -87,6 +87,8 @@ int io_validate_user_buf_range(u64 uaddr, u64 ulen);
 
 bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
 			      struct io_imu_folio_data *data);
+bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
+			struct io_imu_folio_data *data);
 
 static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
 						       int index)
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 8/8] io_uring/zcrx: try to coalesce area pages
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
                   ` (6 preceding siblings ...)
  2025-06-17 14:48 ` [PATCH 7/8] io_uring: export io_coalesce_buffer() Pavel Begunkov
@ 2025-06-17 14:48 ` Pavel Begunkov
  2025-06-30 17:12 ` [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
  8 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-17 14:48 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, David Wei

Try to shrink the page array into fewer, larger folios where possible. This
reduces the memory footprint, optimises dma mappings, and will be used in
the future for further huge page optimisations.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/zcrx.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 44b5f8084279..9f81682ccf0c 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -166,6 +166,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
 			  struct io_uring_zcrx_area_reg *area_reg)
 {
 	unsigned folio_shift = PAGE_SHIFT;
+	struct io_imu_folio_data data;
 	struct scatterlist *sgl;
 	struct page **pages;
 	int nr_pages, ret;
@@ -180,6 +181,17 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
+	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
+		/*
+		 * Only coalesce folio addr-aligned pages and when we can
+		 * improve the size.
+		 */
+		if (data.nr_pages_mid != 1 &&
+		    data.nr_pages_head == data.nr_pages_mid &&
+		    io_coalesce_buffer(&pages, &nr_pages, &data))
+			folio_shift = data.folio_shift;
+	}
+
 	ret = sg_alloc_table(&mem->page_sg_table, nr_pages, GFP_KERNEL_ACCOUNT);
 	if (ret)
 		return ret;
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/8] zcrx huge pages support Vol 1
  2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
                   ` (7 preceding siblings ...)
  2025-06-17 14:48 ` [PATCH 8/8] io_uring/zcrx: try to coalesce area pages Pavel Begunkov
@ 2025-06-30 17:12 ` Pavel Begunkov
  2025-06-30 17:31   ` Jens Axboe
  8 siblings, 1 reply; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-30 17:12 UTC (permalink / raw)
  To: io-uring; +Cc: David Wei

On 6/17/25 15:48, Pavel Begunkov wrote:
> Deduplicate some umem vs dmabuf code by creating sg_table for umem areas,
> and add huge page coalescing on top. It improves iommu mapping and
> compacts the page array, but leaves optimising the NIC page sizes to
> follow ups.

Any comments?

-- 
Pavel Begunkov


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/8] zcrx huge pages support Vol 1
  2025-06-30 17:12 ` [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
@ 2025-06-30 17:31   ` Jens Axboe
  2025-06-30 17:44     ` Pavel Begunkov
  0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2025-06-30 17:31 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: David Wei

On 6/30/25 11:12 AM, Pavel Begunkov wrote:
> On 6/17/25 15:48, Pavel Begunkov wrote:
>> Deduplicate some umem vs dmabuf code by creating sg_table for umem areas,
>> and add huge page coalescing on top. It improves iommu mapping and
>> compacts the page array, but leaves optimising the NIC page sizes to
>> follow ups.
> 
> Any comments?

I'll take a closer look tomorrow, it's on the backlog after being
away for 10 days.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/8] zcrx huge pages support Vol 1
  2025-06-30 17:31   ` Jens Axboe
@ 2025-06-30 17:44     ` Pavel Begunkov
  0 siblings, 0 replies; 12+ messages in thread
From: Pavel Begunkov @ 2025-06-30 17:44 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: David Wei

On 6/30/25 18:31, Jens Axboe wrote:
> On 6/30/25 11:12 AM, Pavel Begunkov wrote:
>> On 6/17/25 15:48, Pavel Begunkov wrote:
>>> Deduplicate some umem vs dmabuf code by creating sg_table for umem areas,
>>> and add huge page coalescing on top. It improves iommu mapping and
>>> compacts the page array, but leaves optimising the NIC page sizes to
>>> follow ups.
>>
>> Any comments?
> 
> I'll take a closer look tomorrow, it's on the backlog after being
> away for 10 days.

No worries

-- 
Pavel Begunkov


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2025-06-30 17:43 UTC | newest]

Thread overview: 12+ messages
2025-06-17 14:48 [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
2025-06-17 14:48 ` [PATCH 1/8] io_uring/zcrx: return error from io_zcrx_map_area_* Pavel Begunkov
2025-06-17 14:48 ` [PATCH 2/8] io_uring/zcrx: introduce io_populate_area_dma Pavel Begunkov
2025-06-17 14:48 ` [PATCH 3/8] io_uring/zcrx: allocate sgtable for umem areas Pavel Begunkov
2025-06-17 14:48 ` [PATCH 4/8] io_uring/zcrx: assert area type in io_zcrx_iov_page Pavel Begunkov
2025-06-17 14:48 ` [PATCH 5/8] io_uring/zcrx: convert io_zcrx_iov_page to use folios Pavel Begunkov
2025-06-17 14:48 ` [PATCH 6/8] io_uring/zcrx: add infra for large pages Pavel Begunkov
2025-06-17 14:48 ` [PATCH 7/8] io_uring: export io_coalesce_buffer() Pavel Begunkov
2025-06-17 14:48 ` [PATCH 8/8] io_uring/zcrx: try to coalesce area pages Pavel Begunkov
2025-06-30 17:12 ` [PATCH 0/8] zcrx huge pages support Vol 1 Pavel Begunkov
2025-06-30 17:31   ` Jens Axboe
2025-06-30 17:44     ` Pavel Begunkov
