public inbox for [email protected]
 help / color / mirror / Atom feed
From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
	Pavel Begunkov <[email protected]>,
	Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
	"David S. Miller" <[email protected]>,
	Eric Dumazet <[email protected]>,
	Jesper Dangaard Brouer <[email protected]>,
	David Ahern <[email protected]>,
	Mina Almasry <[email protected]>
Subject: [RFC PATCH v3 12/20] io_uring: add ZC buf and pool
Date: Tue, 19 Dec 2023 13:03:49 -0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

From: David Wei <[email protected]>

[TODO: REVIEW COMMIT MESSAGE]

This patch adds two objects:

* Zero copy buffer representation, holding a page, its mapped dma_addr,
  and a refcount for lifetime management.
* Zero copy pool, spiritually similar to page pool, that holds ZC bufs
  and hands them out to net devices.

Pool regions are registered w/ io_uring using the registered buffer API,
with a 1:1 mapping between region and nr_iovec in
io_uring_register_buffers. This does the heavy lifting of pinning and
chunking into bvecs into a struct io_mapped_ubuf for us.

For now as there is only one pool region per ifq, there is no separate
API for adding/removing regions yet and it is mapped implicitly during
ifq registration.

Signed-off-by: David Wei <[email protected]>
---
 include/linux/io_uring/net.h |   8 +++
 io_uring/zc_rx.c             | 135 ++++++++++++++++++++++++++++++++++-
 io_uring/zc_rx.h             |  15 ++++
 3 files changed, 157 insertions(+), 1 deletion(-)

diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
index b58f39fed4d5..d994d26116d0 100644
--- a/include/linux/io_uring/net.h
+++ b/include/linux/io_uring/net.h
@@ -2,8 +2,16 @@
 #ifndef _LINUX_IO_URING_NET_H
 #define _LINUX_IO_URING_NET_H
 
+#include <net/page_pool/types.h>
+
 struct io_uring_cmd;
 
+struct io_zc_rx_buf {
+	struct page_pool_iov	ppiov;
+	struct page		*page;
+	dma_addr_t		dma;
+};
+
 #if defined(CONFIG_IO_URING)
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 06e2c54d3f3d..1e656b481725 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
+#include <linux/nospec.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -15,6 +16,11 @@
 
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 
+static inline struct device *netdev2dev(struct net_device *dev)
+{
+	return dev->dev.parent;
+}
+
 static int __io_queue_mgmt(struct net_device *dev, struct io_zc_rx_ifq *ifq,
 			   u16 queue_id)
 {
@@ -67,6 +73,129 @@ static void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq)
 		folio_put(virt_to_folio(ifq->ring));
 }
 
+static int io_zc_rx_init_buf(struct device *dev, struct page *page, u16 pool_id,
+			     u32 pgid, struct io_zc_rx_buf *buf)
+{
+	dma_addr_t addr = 0;
+
+	/* Skip dma setup for devices that don't do any DMA transfers */
+	if (dev) {
+		addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
+					  DMA_BIDIRECTIONAL,
+					  DMA_ATTR_SKIP_CPU_SYNC);
+		if (dma_mapping_error(dev, addr))
+			return -ENOMEM;
+	}
+
+	buf->dma = addr;
+	buf->page = page;
+	refcount_set(&buf->ppiov.refcount, 0);
+	buf->ppiov.owner = NULL;
+	buf->ppiov.pp = NULL;
+	get_page(page);
+	return 0;
+}
+
+static void io_zc_rx_free_buf(struct device *dev, struct io_zc_rx_buf *buf)
+{
+	struct page *page = buf->page;
+
+	if (dev)
+		dma_unmap_page_attrs(dev, buf->dma, PAGE_SIZE,
+				     DMA_BIDIRECTIONAL,
+				     DMA_ATTR_SKIP_CPU_SYNC);
+	put_page(page);
+}
+
+static int io_zc_rx_map_pool(struct io_zc_rx_pool *pool,
+			     struct io_mapped_ubuf *imu,
+			     struct device *dev)
+{
+	struct io_zc_rx_buf *buf;
+	struct page *page;
+	int i, ret;
+
+	for (i = 0; i < imu->nr_bvecs; i++) {
+		page = imu->bvec[i].bv_page;
+		buf = &pool->bufs[i];
+		ret = io_zc_rx_init_buf(dev, page, pool->pool_id, i, buf);
+		if (ret)
+			goto err;
+
+		pool->freelist[i] = i;
+	}
+
+	pool->free_count = imu->nr_bvecs;
+	return 0;
+err:
+	while (i--) {
+		buf = &pool->bufs[i];
+		io_zc_rx_free_buf(dev, buf);
+	}
+	return ret;
+}
+
+static int io_zc_rx_create_pool(struct io_ring_ctx *ctx,
+				struct io_zc_rx_ifq *ifq,
+				u16 id)
+{
+	struct device *dev = netdev2dev(ifq->dev);
+	struct io_mapped_ubuf *imu;
+	struct io_zc_rx_pool *pool;
+	int nr_pages;
+	int ret;
+
+	if (ifq->pool)
+		return -EFAULT;
+
+	if (unlikely(id >= ctx->nr_user_bufs))
+		return -EFAULT;
+	id = array_index_nospec(id, ctx->nr_user_bufs);
+	imu = ctx->user_bufs[id];
+	if (imu->ubuf & ~PAGE_MASK || imu->ubuf_end & ~PAGE_MASK)
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	nr_pages = imu->nr_bvecs;
+	pool = kvmalloc(struct_size(pool, freelist, nr_pages), GFP_KERNEL);
+	if (!pool)
+		goto err;
+
+	pool->bufs = kvmalloc_array(nr_pages, sizeof(*pool->bufs), GFP_KERNEL);
+	if (!pool->bufs)
+		goto err_buf;
+
+	ret = io_zc_rx_map_pool(pool, imu, dev);
+	if (ret)
+		goto err_map;
+
+	pool->ifq = ifq;
+	pool->pool_id = id;
+	pool->nr_bufs = nr_pages;
+	spin_lock_init(&pool->freelist_lock);
+	ifq->pool = pool;
+	return 0;
+err_map:
+	kvfree(pool->bufs);
+err_buf:
+	kvfree(pool);
+err:
+	return ret;
+}
+
+static void io_zc_rx_destroy_pool(struct io_zc_rx_pool *pool)
+{
+	struct device *dev = netdev2dev(pool->ifq->dev);
+	struct io_zc_rx_buf *buf;
+
+	for (int i = 0; i < pool->nr_bufs; i++) {
+		buf = &pool->bufs[i];
+		io_zc_rx_free_buf(dev, buf);
+	}
+	kvfree(pool->bufs);
+	kvfree(pool);
+}
+
 static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_zc_rx_ifq *ifq;
@@ -105,6 +234,8 @@ static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq)
 {
 	io_shutdown_ifq(ifq);
 
+	if (ifq->pool)
+		io_zc_rx_destroy_pool(ifq->pool);
 	if (ifq->dev)
 		dev_put(ifq->dev);
 	io_free_rbuf_ring(ifq);
@@ -141,7 +272,9 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 	if (!ifq->dev)
 		goto err;
 
-	/* TODO: map zc region and initialise zc pool */
+	ret = io_zc_rx_create_pool(ctx, ifq, reg.region_id);
+	if (ret)
+		goto err;
 
 	ifq->rq_entries = reg.rq_entries;
 	ifq->cq_entries = reg.cq_entries;
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 9257dda77e92..af1d865525d2 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -3,15 +3,30 @@
 #define IOU_ZC_RX_H
 
 #include <linux/io_uring_types.h>
+#include <linux/io_uring/net.h>
 #include <linux/skbuff.h>
 
 #define IO_ZC_MAX_IFQ_SOCKETS		16
 #define IO_ZC_IFQ_IDX_OFFSET		16
 #define IO_ZC_IFQ_IDX_MASK		((1U << IO_ZC_IFQ_IDX_OFFSET) - 1)
 
+struct io_zc_rx_pool {
+	struct io_zc_rx_ifq	*ifq;
+	struct io_zc_rx_buf	*bufs;
+	u32			nr_bufs;
+	u16			pool_id;
+
+	/* freelist */
+	spinlock_t		freelist_lock;
+	u32			free_count;
+	u32			freelist[];
+};
+
 struct io_zc_rx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct net_device		*dev;
+	struct io_zc_rx_pool		*pool;
+
 	struct io_rbuf_ring		*ring;
 	struct io_uring_rbuf_rqe 	*rqes;
 	struct io_uring_rbuf_cqe 	*cqes;
-- 
2.39.3


  parent reply	other threads:[~2023-12-19 21:04 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-19 21:03 [RFC PATCH v3 00/20] Zero copy Rx using io_uring David Wei
2023-12-19 21:03 ` [RFC PATCH v3 01/20] net: page_pool: add ppiov mangling helper David Wei
2023-12-19 23:22   ` Mina Almasry
2023-12-19 23:59     ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 02/20] tcp: don't allow non-devmem originated ppiov David Wei
2023-12-19 23:24   ` Mina Almasry
2023-12-20  1:29     ` Pavel Begunkov
2024-01-02 16:11       ` Mina Almasry
2023-12-19 21:03 ` [RFC PATCH v3 03/20] net: page pool: rework ppiov life cycle David Wei
2023-12-19 23:35   ` Mina Almasry
2023-12-20  0:49     ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 04/20] net: enable napi_pp_put_page for ppiov David Wei
2023-12-19 21:03 ` [RFC PATCH v3 05/20] net: page_pool: add ->scrub mem provider callback David Wei
2023-12-19 21:03 ` [RFC PATCH v3 06/20] io_uring: separate header for exported net bits David Wei
2023-12-20 16:01   ` Jens Axboe
2023-12-19 21:03 ` [RFC PATCH v3 07/20] io_uring: add interface queue David Wei
2023-12-20 16:13   ` Jens Axboe
2023-12-20 16:23     ` Pavel Begunkov
2023-12-21  1:44     ` David Wei
2023-12-21 17:57   ` Willem de Bruijn
2023-12-30 16:25     ` Pavel Begunkov
2023-12-31 22:25       ` Willem de Bruijn
2023-12-19 21:03 ` [RFC PATCH v3 08/20] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-12-20 16:13   ` Jens Axboe
2023-12-19 21:03 ` [RFC PATCH v3 09/20] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-12-19 21:03 ` [RFC PATCH v3 10/20] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2023-12-20 16:06   ` Jens Axboe
2023-12-20 16:24     ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 11/20] io_uring/zcrx: implement socket registration David Wei
2023-12-19 21:03 ` David Wei [this message]
2023-12-19 21:03 ` [RFC PATCH v3 13/20] io_uring: implement pp memory provider for zc rx David Wei
2023-12-19 23:44   ` Mina Almasry
2023-12-20  0:39     ` Pavel Begunkov
2023-12-21 19:36   ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 14/20] net: page pool: add io_uring memory provider David Wei
2023-12-19 23:39   ` Mina Almasry
2023-12-20  0:04     ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 15/20] io_uring: add io_recvzc request David Wei
2023-12-20 16:27   ` Jens Axboe
2023-12-20 17:04     ` Pavel Begunkov
2023-12-20 18:09       ` Jens Axboe
2023-12-21 18:59         ` Pavel Begunkov
2023-12-21 21:32           ` Jens Axboe
2023-12-30 21:15             ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 16/20] net: execute custom callback from napi David Wei
2023-12-19 21:03 ` [RFC PATCH v3 17/20] io_uring/zcrx: add copy fallback David Wei
2023-12-19 21:03 ` [RFC PATCH v3 18/20] veth: add support for io_uring zc rx David Wei
2023-12-19 21:03 ` [RFC PATCH v3 19/20] net: page pool: generalise ppiov dma address get David Wei
2023-12-21 19:51   ` Mina Almasry
2023-12-19 21:03 ` [RFC PATCH v3 20/20] bnxt: enable io_uring zc page pool David Wei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox