public inbox for [email protected]
 help / color / mirror / Atom feed
From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
	Pavel Begunkov <[email protected]>,
	Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
	"David S. Miller" <[email protected]>,
	Eric Dumazet <[email protected]>,
	Jesper Dangaard Brouer <[email protected]>,
	David Ahern <[email protected]>,
	Mina Almasry <[email protected]>
Subject: [RFC PATCH v4 10/16] io_uring: add zero copy buf representation and pool
Date: Tue, 12 Mar 2024 14:44:24 -0700	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

From: David Wei <[email protected]>

This patch adds two objects:

* Zero copy buffer representation, holding a page and a net_iov. The
  page is needed as net_iov is designed for opaque device memory,
  whereas we are backed by real pages.
* Zero copy pool, spiritually similar to page pool, that holds ZC bufs
  and hands them out to net devices. This will be used as an
  implementation of page pool memory provider.

Pool regions are registered with io_uring using the registered buffer API,
with a 1:1 mapping between a region and an iovec passed to
io_uring_register_buffers. This does the heavy lifting of pinning the pages
and chunking them into bvecs in a struct io_mapped_ubuf for us.

For now, as there is only one pool region per ifq, there is no separate
API for adding/removing regions yet; the region is mapped implicitly during
ifq registration.

Signed-off-by: David Wei <[email protected]>
---
 include/linux/io_uring/net.h |   7 +++
 io_uring/zc_rx.c             | 110 +++++++++++++++++++++++++++++++++++
 io_uring/zc_rx.h             |  15 +++++
 3 files changed, 132 insertions(+)

diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
index b58f39fed4d5..05d5a6a97264 100644
--- a/include/linux/io_uring/net.h
+++ b/include/linux/io_uring/net.h
@@ -2,8 +2,15 @@
 #ifndef _LINUX_IO_URING_NET_H
 #define _LINUX_IO_URING_NET_H
 
+#include <net/page_pool/types.h>
+
 struct io_uring_cmd;
 
+struct io_zc_rx_buf {	/* one zero copy Rx buffer: a net_iov plus its page */
+	struct net_iov		niov;	/* zeroed by io_zc_rx_init_buf() */
+	struct page		*page;	/* real page backing this buffer */
+};
+
 #if defined(CONFIG_IO_URING)
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 77459c0fc14b..326ae3fcc643 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
+#include <linux/nospec.h>
 #include <net/tcp.h>
 #include <net/af_unix.h>
 
@@ -66,6 +67,109 @@ static void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq)
 		folio_put(virt_to_folio(ifq->rq_ring));
 }
 
+static int io_zc_rx_init_buf(struct page *page, struct io_zc_rx_buf *buf)
+{	/* Bind @page to @buf and reset its net_iov; always returns 0. */
+	memset(&buf->niov, 0, sizeof(buf->niov));
+	atomic_long_set(&buf->niov.pp_ref_count, 0);
+
+	buf->page = page;
+	get_page(page);	/* dropped again by io_zc_rx_free_buf() */
+	return 0;
+}
+
+static void io_zc_rx_free_buf(struct io_zc_rx_buf *buf)
+{	/* Drop the page reference taken by io_zc_rx_init_buf(). */
+	struct page *page = buf->page;
+
+	put_page(page);
+}
+
+static int io_zc_rx_init_pool(struct io_zc_rx_pool *pool,
+			     struct io_mapped_ubuf *imu)
+{	/* Set up one buf per bvec of @imu and mark all of them free. */
+	struct io_zc_rx_buf *buf;
+	struct page *page;
+	int i, ret;
+
+	for (i = 0; i < imu->nr_bvecs; i++) {
+		page = imu->bvec[i].bv_page;
+		buf = &pool->bufs[i];
+		ret = io_zc_rx_init_buf(page, buf);
+		if (ret)
+			goto err;
+
+		pool->freelist[i] = i;	/* freelist holds buf indices */
+	}
+
+	pool->free_count = imu->nr_bvecs;
+	return 0;
+err:
+	while (i--) {	/* unwind only the bufs initialised before the failure */
+		buf = &pool->bufs[i];
+		io_zc_rx_free_buf(buf);
+	}
+	return ret;
+}
+
+static int io_zc_rx_create_pool(struct io_ring_ctx *ctx,
+				struct io_zc_rx_ifq *ifq,
+				u16 id)
+{	/* Build ifq->pool from registered buffer @id; 0 or negative errno. */
+	struct io_mapped_ubuf *imu;
+	struct io_zc_rx_pool *pool;
+	int nr_pages;
+	int ret;
+
+	if (ifq->pool)	/* an ifq may only have a single pool */
+		return -EFAULT;
+
+	if (unlikely(id >= ctx->nr_user_bufs))
+		return -EFAULT;
+	id = array_index_nospec(id, ctx->nr_user_bufs);	/* bound id under speculation */
+	imu = ctx->user_bufs[id];
+	if (imu->ubuf & ~PAGE_MASK || imu->ubuf_end & ~PAGE_MASK)	/* must be page aligned */
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	nr_pages = imu->nr_bvecs;	/* NOTE(review): assumes one page per bvec - confirm */
+	pool = kvmalloc(struct_size(pool, freelist, nr_pages), GFP_KERNEL);	/* header + freelist[] */
+	if (!pool)
+		goto err;
+
+	pool->bufs = kvmalloc_array(nr_pages, sizeof(*pool->bufs), GFP_KERNEL);
+	if (!pool->bufs)
+		goto err_buf;
+
+	ret = io_zc_rx_init_pool(pool, imu);
+	if (ret)
+		goto err_map;
+
+	pool->ifq = ifq;
+	pool->pool_id = id;
+	pool->nr_bufs = nr_pages;
+	spin_lock_init(&pool->freelist_lock);
+	ifq->pool = pool;	/* published only once fully initialised */
+	return 0;
+err_map:	/* buf init failed: release the buf array */
+	kvfree(pool->bufs);
+err_buf:	/* buf array allocation failed: release the pool itself */
+	kvfree(pool);
+err:
+	return ret;
+}
+
+static void io_zc_rx_free_pool(struct io_zc_rx_pool *pool)
+{	/* Release every buf's page reference, then the buf array and the pool. */
+	struct io_zc_rx_buf *buf;
+
+	for (int i = 0; i < pool->nr_bufs; i++) {
+		buf = &pool->bufs[i];
+		io_zc_rx_free_buf(buf);
+	}
+	kvfree(pool->bufs);
+	kvfree(pool);
+}
+
 static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_zc_rx_ifq *ifq;
@@ -104,6 +208,8 @@ static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq)
 {
 	io_shutdown_ifq(ifq);
 
+	if (ifq->pool)
+		io_zc_rx_free_pool(ifq->pool);
 	if (ifq->dev)
 		dev_put(ifq->dev);
 	io_free_rbuf_ring(ifq);
@@ -141,6 +247,10 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 	if (!ifq->dev)
 		goto err;
 
+	ret = io_zc_rx_create_pool(ctx, ifq, reg.region_id);
+	if (ret)
+		goto err;
+
 	ifq->rq_entries = reg.rq_entries;
 	ifq->if_rxq_id = reg.if_rxq_id;
 
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index d7b8397d525f..466b2b8f9813 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -3,15 +3,30 @@
 #define IOU_ZC_RX_H
 
 #include <linux/io_uring_types.h>
+#include <linux/io_uring/net.h>
 #include <linux/skbuff.h>
 
 #define IO_ZC_MAX_IFQ_SOCKETS		16
 #define IO_ZC_IFQ_IDX_OFFSET		16
 #define IO_ZC_IFQ_IDX_MASK		((1U << IO_ZC_IFQ_IDX_OFFSET) - 1)
 
+struct io_zc_rx_pool {	/* pool of zero copy bufs attached to one ifq */
+	struct io_zc_rx_ifq	*ifq;	/* ifq this pool was created for */
+	struct io_zc_rx_buf	*bufs;	/* array of nr_bufs entries */
+	u32			nr_bufs;	/* taken from the region's nr_bvecs */
+	u16			pool_id;	/* registered buffer index of the region */
+
+	/* freelist */
+	spinlock_t		freelist_lock;	/* NOTE(review): presumably guards the two fields below - no user in this patch */
+	u32			free_count;	/* starts at the number of bufs */
+	u32			freelist[];	/* buf indices; initially 0..nr_bufs-1 */
+};
+
 struct io_zc_rx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct net_device		*dev;
+	struct io_zc_rx_pool		*pool;
+
 	struct io_uring			*rq_ring;
 	struct io_uring_rbuf_rqe 	*rqes;
 	u32				rq_entries;
-- 
2.43.0


  parent reply	other threads:[~2024-03-12 21:44 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-12 21:44 [RFC PATCH v4 00/16] Zero copy Rx using io_uring David Wei
2024-03-12 21:44 ` [RFC PATCH v4 01/16] net: generalise pp provider params passing David Wei
2024-03-12 21:44 ` [RFC PATCH v4 02/16] io_uring: delayed cqe commit David Wei
2024-03-12 21:44 ` [RFC PATCH v4 03/16] net: page_pool: add ->scrub mem provider callback David Wei
2024-03-12 21:44 ` [RFC PATCH v4 04/16] io_uring: separate header for exported net bits David Wei
2024-03-12 21:44 ` [RFC PATCH v4 05/16] io_uring: introduce interface queue David Wei
2024-03-12 21:44 ` [RFC PATCH v4 06/16] io_uring: add mmap support for shared ifq ringbuffers David Wei
2024-03-12 21:44 ` [RFC PATCH v4 07/16] netdev: add XDP_SETUP_ZC_RX command David Wei
2024-03-12 21:44 ` [RFC PATCH v4 08/16] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2024-03-12 21:44 ` [RFC PATCH v4 09/16] io_uring/zcrx: implement socket registration David Wei
2024-03-12 21:44 ` David Wei [this message]
2024-03-12 21:44 ` [RFC PATCH v4 11/16] io_uring: implement pp memory provider for zc rx David Wei
2024-03-12 21:44 ` [RFC PATCH v4 12/16] io_uring/zcrx: implement PP_FLAG_DMA_* handling David Wei
2024-03-12 21:44 ` [RFC PATCH v4 13/16] io_uring: add io_recvzc request David Wei
2024-03-13 20:25   ` Jens Axboe
2024-03-13 20:26     ` Pavel Begunkov
2024-03-13 21:03       ` Jens Axboe
2024-03-14 16:14       ` Jens Axboe
2024-03-15 17:34         ` Pavel Begunkov
2024-03-15 18:38           ` Jens Axboe
2024-03-15 23:52             ` Pavel Begunkov
2024-03-16 16:59               ` Jens Axboe
2024-03-17 21:22                 ` Pavel Begunkov
2024-03-17 21:30                   ` Jens Axboe
2024-03-12 21:44 ` [RFC PATCH v4 14/16] net: execute custom callback from napi David Wei
2024-03-12 21:44 ` [RFC PATCH v4 15/16] io_uring/zcrx: add copy fallback David Wei
2024-03-12 21:44 ` [RFC PATCH v4 16/16] veth: add support for io_uring zc rx David Wei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox