public inbox for io-uring@vger.kernel.org
From: Pavel Begunkov <asml.silence@gmail.com>
To: Jakub Kicinski <kuba@kernel.org>, netdev@vger.kernel.org
Cc: asml.silence@gmail.com, io-uring@vger.kernel.org,
	Eric Dumazet <edumazet@google.com>,
	Willem de Bruijn <willemb@google.com>,
	Paolo Abeni <pabeni@redhat.com>,
	andrew+netdev@lunn.ch, horms@kernel.org, davem@davemloft.net,
	sdf@fomichev.me, almasrymina@google.com, dw@davidwei.uk,
	michael.chan@broadcom.com, dtatulea@nvidia.com,
	ap420073@gmail.com
Subject: [RFC v1 22/22] io_uring/zcrx: implement large rx buffer support
Date: Mon, 28 Jul 2025 12:04:26 +0100	[thread overview]
Message-ID: <f6d352a8eb9f0297196fdaf0eccc6d9e2a44a357.1753694914.git.asml.silence@gmail.com> (raw)
In-Reply-To: <cover.1753694913.git.asml.silence@gmail.com>

Some network cards support receive buffers larger than 4K, which can be
vastly beneficial for performance. Benchmarks of this patch showed up to
a 30% CPU utilisation improvement for 32K vs 4K buffers.

Allow zcrx users to specify the buffer size via struct
io_uring_zcrx_ifq_reg::rx_buf_len. If set to zero, zcrx uses a default
value. zcrx verifies that the memory backing the area can be split into
physically contiguous chunks of the requested size and fails
registration otherwise. That is more restrictive than strictly needed,
as only the DMA addresses have to be contiguous, but relaxing it is
beyond the scope of this series.
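
For illustration only, a minimal userspace sketch of how a caller might
request 32K buffers at registration time. It assumes a kernel with this
series applied; ring setup, the area/region descriptors and error
handling are elided, and the helper name and the rq_entries value are
made up for the example:

#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Hypothetical helper: register a zcrx ifq asking for 32K rx buffers. */
static int register_zcrx_32k(int ring_fd, uint32_t ifindex, uint32_t rxq,
			     uint64_t area_ptr, uint64_t region_ptr)
{
	struct io_uring_zcrx_ifq_reg reg;

	memset(&reg, 0, sizeof(reg));
	reg.if_idx = ifindex;		/* target netdev */
	reg.if_rxq = rxq;		/* rx queue to bind */
	reg.rq_entries = 4096;		/* refill ring size, example value */
	reg.area_ptr = area_ptr;	/* struct io_uring_zcrx_area_reg */
	reg.region_ptr = region_ptr;	/* struct io_uring_region_desc */
	/* New field: 0 keeps the default, otherwise a power of two >= PAGE_SIZE */
	reg.rx_buf_len = 32 * 1024;

	/* On success the kernel writes the effective size back to reg.rx_buf_len */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_IFQ, &reg, 1);
}

Leaving rx_buf_len at zero preserves the current behaviour, so existing
users are unaffected.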

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/uapi/linux/io_uring.h |  2 +-
 io_uring/zcrx.c               | 39 +++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 9d306eb5251c..8e3a342a4ad8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1041,7 +1041,7 @@ struct io_uring_zcrx_ifq_reg {
 
 	struct io_uring_zcrx_offsets offsets;
 	__u32	zcrx_id;
-	__u32	__resv2;
+	__u32	rx_buf_len;
 	__u64	__resv[3];
 };
 
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index a00243e10164..3caa3f472af1 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -13,6 +13,7 @@
 #include <net/page_pool/memory_provider.h>
 #include <net/netlink.h>
 #include <net/netdev_rx_queue.h>
+#include <net/netdev_queues.h>
 #include <net/tcp.h>
 #include <net/rps.h>
 
@@ -53,6 +54,18 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
 	return area->mem.pages[net_iov_idx(niov) << niov_pages_shift];
 }
 
+static int io_area_max_shift(struct io_zcrx_mem *mem)
+{
+	struct sg_table *sgt = mem->sgt;
+	struct scatterlist *sg;
+	unsigned order = -1U;
+	unsigned i;
+
+	for_each_sgtable_dma_sg(sgt, sg, i)
+		order = min(order, __ffs(sg->length));
+	return order;
+}
+
 static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
 				struct io_zcrx_area *area)
 {
@@ -384,8 +397,10 @@ static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
 }
 
 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
-			       struct io_uring_zcrx_area_reg *area_reg)
+			       struct io_uring_zcrx_area_reg *area_reg,
+			       struct io_uring_zcrx_ifq_reg *reg)
 {
+	int buf_size_shift = PAGE_SHIFT;
 	struct io_zcrx_area *area;
 	unsigned nr_iovs;
 	int i, ret;
@@ -400,7 +415,16 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 	if (ret)
 		goto err;
 
-	ifq->niov_shift = PAGE_SHIFT;
+	if (reg->rx_buf_len) {
+		if (!is_power_of_2(reg->rx_buf_len) ||
+		     reg->rx_buf_len < PAGE_SIZE)
+			return -EINVAL;
+		buf_size_shift = ilog2(reg->rx_buf_len);
+	}
+	if (buf_size_shift > io_area_max_shift(&area->mem))
+		return -EINVAL;
+
+	ifq->niov_shift = buf_size_shift;
 	nr_iovs = area->mem.size >> ifq->niov_shift;
 	area->nia.num_niovs = nr_iovs;
 
@@ -522,6 +546,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zcrx_ifq_reg __user *arg)
 {
 	struct pp_memory_provider_params mp_param = {};
+	struct netdev_queue_config qcfg = {};
 	struct io_uring_zcrx_area_reg area;
 	struct io_uring_zcrx_ifq_reg reg;
 	struct io_uring_region_desc rd;
@@ -544,8 +569,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
 		return -EFAULT;
-	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
-	    reg.__resv2 || reg.zcrx_id)
+	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.zcrx_id)
 		return -EINVAL;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
@@ -589,13 +613,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	}
 	get_device(ifq->dev);
 
-	ret = io_zcrx_create_area(ifq, &area);
+	ret = io_zcrx_create_area(ifq, &area, &reg);
 	if (ret)
 		goto err;
 
 	mp_param.mp_ops = &io_uring_pp_zc_ops;
 	mp_param.mp_priv = ifq;
-	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
+	qcfg.rx_buf_len = 1U << ifq->niov_shift;
+	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, &qcfg);
 	if (ret)
 		goto err;
 	ifq->if_rxq = reg.if_rxq;
@@ -612,6 +637,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			goto err;
 	}
 
+	reg.rx_buf_len = 1U << ifq->niov_shift;
+
 	if (copy_to_user(arg, &reg, sizeof(reg)) ||
 	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
 	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
-- 
2.49.0



Thread overview: 66+ messages
2025-07-28 11:04 [RFC v1 00/22] Large rx buffer support for zcrx Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 01/22] docs: ethtool: document that rx_buf_len must control payload lengths Pavel Begunkov
2025-07-28 18:11   ` Mina Almasry
2025-07-28 21:36   ` Mina Almasry
2025-08-01 23:13     ` Jakub Kicinski
2025-07-28 11:04 ` [RFC v1 02/22] net: ethtool: report max value for rx-buf-len Pavel Begunkov
2025-07-29  5:00   ` Subbaraya Sundeep
2025-07-28 11:04 ` [RFC v1 03/22] net: use zero value to restore rx_buf_len to default Pavel Begunkov
2025-07-29  5:03   ` Subbaraya Sundeep
2025-07-28 11:04 ` [RFC v1 04/22] net: clarify the meaning of netdev_config members Pavel Begunkov
2025-07-28 21:44   ` Mina Almasry
2025-08-01 23:14     ` Jakub Kicinski
2025-07-28 11:04 ` [RFC v1 05/22] net: add rx_buf_len to netdev config Pavel Begunkov
2025-07-28 21:50   ` Mina Almasry
2025-08-01 23:18     ` Jakub Kicinski
2025-07-28 11:04 ` [RFC v1 06/22] eth: bnxt: read the page size from the adapter struct Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 07/22] eth: bnxt: set page pool page order based on rx_page_size Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 08/22] eth: bnxt: support setting size of agg buffers via ethtool Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 09/22] net: move netdev_config manipulation to dedicated helpers Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 10/22] net: reduce indent of struct netdev_queue_mgmt_ops members Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 11/22] net: allocate per-queue config structs and pass them thru the queue API Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 12/22] net: pass extack to netdev_rx_queue_restart() Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 13/22] net: add queue config validation callback Pavel Begunkov
2025-07-28 22:26   ` Mina Almasry
2025-07-28 11:04 ` [RFC v1 14/22] eth: bnxt: always set the queue mgmt ops Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 15/22] eth: bnxt: store the rx buf size per queue Pavel Begunkov
2025-07-28 22:33   ` Mina Almasry
2025-08-01 23:20     ` Jakub Kicinski
2025-07-28 11:04 ` [RFC v1 16/22] eth: bnxt: adjust the fill level of agg queues with larger buffers Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 17/22] netdev: add support for setting rx-buf-len per queue Pavel Begunkov
2025-07-28 23:10   ` Mina Almasry
2025-08-01 23:37     ` Jakub Kicinski
2025-07-28 11:04 ` [RFC v1 18/22] net: wipe the setting of deactived queues Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 19/22] eth: bnxt: use queue op config validate Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 20/22] eth: bnxt: support per queue configuration of rx-buf-len Pavel Begunkov
2025-07-28 11:04 ` [RFC v1 21/22] net: parametrise mp open with a queue config Pavel Begunkov
2025-08-02  0:10   ` Jakub Kicinski
2025-08-04 12:50     ` Pavel Begunkov
2025-08-05 22:43       ` Jakub Kicinski
2025-08-06  0:05       ` Jakub Kicinski
2025-08-06 16:48       ` Mina Almasry
2025-08-06 18:11         ` Jakub Kicinski
2025-08-06 18:30           ` Mina Almasry
2025-08-06 22:05             ` Jakub Kicinski
2025-07-28 11:04 ` Pavel Begunkov [this message]
2025-07-28 17:13 ` [RFC v1 00/22] Large rx buffer support for zcrx Stanislav Fomichev
2025-07-28 18:18   ` Pavel Begunkov
2025-07-28 20:21     ` Stanislav Fomichev
2025-07-28 21:28       ` Pavel Begunkov
2025-07-28 22:06         ` Stanislav Fomichev
2025-07-28 22:44           ` Pavel Begunkov
2025-07-29 16:33             ` Stanislav Fomichev
2025-07-30 14:16               ` Pavel Begunkov
2025-07-30 15:50                 ` Stanislav Fomichev
2025-07-31 19:34                   ` Mina Almasry
2025-07-31 19:57                     ` Pavel Begunkov
2025-07-31 20:05                       ` Mina Almasry
2025-08-01  9:48                         ` Pavel Begunkov
2025-08-01  9:58                     ` Pavel Begunkov
2025-07-28 23:22           ` Mina Almasry
2025-07-29 16:41             ` Stanislav Fomichev
2025-07-29 17:01               ` Mina Almasry
2025-07-28 18:54 ` Mina Almasry
2025-07-28 19:42   ` Pavel Begunkov
2025-07-28 20:23     ` Mina Almasry
2025-07-28 20:57       ` Pavel Begunkov
