From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>,
Willem de Bruijn <[email protected]>,
Dragos Tatulea <[email protected]>
Subject: [PATCH 06/20] io_uring: add ZC buf and pool
Date: Tue, 7 Nov 2023 13:40:31 -0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
This patch adds two objects:
* Zero copy buffer representation, holding a page, its mapped dma_addr,
and a refcount for lifetime management.
* Zero copy pool, spiritually similar to page pool, that holds ZC bufs
and hands them out to net devices.
The ZC pool currently has two tiers: a fast lockless cache
that should only be accessed from the NAPI context of a single Rx queue,
and a freelist. When a ZC pool region is first mapped, it is added to
the freelist.
During normal operation, bufs are moved from the freelist into the cache
in POOL_CACHE_SIZE blocks before being given out.
Pool regions are registered w/ io_uring using the registered buffer API,
with a 1:1 mapping between region and nr_iovec in
io_uring_register_buffers. This does the heavy lifting of pinning the
pages and chunking them into bvecs in a struct io_mapped_ubuf for us.
For now as there is only one pool region per ifq, there is no separate
API for adding/removing regions yet and it is mapped implicitly during
ifq registration.
Co-developed-by: Pavel Begunkov <[email protected]>
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
include/linux/io_uring.h | 6 ++
io_uring/zc_rx.c | 173 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 178 insertions(+), 1 deletion(-)
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 106cdc55ff3b..abfb73e257a4 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -41,6 +41,12 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
return sqe->cmd;
}
+struct io_zc_rx_buf { /* one zero-copy Rx buffer: a page plus its DMA mapping */
+ dma_addr_t dma; /* DMA address obtained for page by io_zc_rx_map_buf() */
+ struct page *page; /* backing page; io_zc_rx_map_buf() takes a reference on it */
+ atomic_t refcount; /* set to 0 when the buffer is mapped */
+};
+
#if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd);
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index b5266a67395e..0f5fa9ab5cec 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -5,14 +5,44 @@
#include <linux/mm.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
+#include <linux/nospec.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "kbuf.h"
+#include "rsrc.h"
#include "zc_rx.h"
#include "rsrc.h"
+#define POOL_CACHE_SIZE 128 /* number of entries in io_zc_rx_pool.cache */
+
+struct io_zc_rx_pool { /* pool of zero-copy buffers attached to one ifq */
+ struct io_zc_rx_ifq *ifq; /* interface queue this pool was created for */
+ struct io_zc_rx_buf *bufs; /* array of nr_pages buffers */
+ u16 pool_id; /* index of the backing registered buffer in ctx->user_bufs */
+ u32 nr_pages; /* taken from the registered buffer's nr_bvecs */
+
+ /* fast cache: lockless, meant to be touched only from the NAPI context of a single Rx queue */
+ u32 cache_count; /* set to 0 at pool creation */
+ u32 cache[POOL_CACHE_SIZE];
+
+ /* freelist */
+ spinlock_t freelist_lock;
+ u32 free_count; /* set to nr_pages at pool creation */
+ u32 freelist[]; /* flexible array; entry i is initialised to i (presumably an index into bufs) */
+};
+
+/* Return the parent of the struct device embedded in @dev. */
+static inline struct device *netdev2dev(struct net_device *dev)
+{
+	struct device *parent = dev->dev.parent;
+
+	return parent;
+}
+
+/*
+ * Pack a pool id and a page index into one 64-bit tag: 0xface in bits
+ * 63..48, @pool_id in bits 47..32 and @pgid in bits 31..0.
+ */
+static inline u64 mk_page_info(u16 pool_id, u32 pgid)
+{
+	u64 info = (u64)0xface << 48;
+
+	info |= (u64)pool_id << 32;
+	info |= (u64)pgid;
+
+	return info;
+}
+
typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
static int __io_queue_mgmt(struct net_device *dev, struct io_zc_rx_ifq *ifq,
@@ -42,6 +72,143 @@ static int io_close_zc_rxq(struct io_zc_rx_ifq *ifq)
return __io_queue_mgmt(ifq->dev, NULL, ifq->if_rxq_id);
}
+static int io_zc_rx_map_buf(struct device *dev, struct page *page, u16 pool_id, /* DMA-map @page for @dev and fill in @buf */
+ u32 pgid, struct io_zc_rx_buf *buf) /* returns 0, or -ENOMEM if the DMA mapping fails */
+{
+ dma_addr_t addr;
+
+ SetPagePrivate(page); /* tag the page before mapping; undone below on failure */
+ set_page_private(page, mk_page_info(pool_id, pgid)); /* private data encodes (pool_id, pgid) */
+
+ addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, /* offset 0, PAGE_SIZE bytes */
+ DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ if (dma_mapping_error(dev, addr)) {
+ set_page_private(page, 0); /* roll back the tagging done above */
+ ClearPagePrivate(page);
+ return -ENOMEM;
+ }
+
+ buf->dma = addr;
+ buf->page = page;
+ atomic_set(&buf->refcount, 0);
+ get_page(page); /* dropped again by put_page() in io_zc_rx_unmap_buf() */
+
+ return 0;
+}
+
+static void io_zc_rx_unmap_buf(struct device *dev, struct io_zc_rx_buf *buf) /* undo io_zc_rx_map_buf() for @buf */
+{
+ struct page *page;
+
+ page = buf->page;
+ set_page_private(page, 0); /* drop the (pool_id, pgid) tag */
+ ClearPagePrivate(page);
+ dma_unmap_page_attrs(dev, buf->dma, PAGE_SIZE, /* same size, direction and attrs as the mapping */
+ DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ put_page(page); /* pairs with get_page() in io_zc_rx_map_buf() */
+}
+
+/*
+ * Map every bvec page of @imu for DMA on @dev and seed the pool's freelist.
+ *
+ * Page i of @imu becomes pool->bufs[i] and freelist[i] is set to i. A page
+ * that already has PagePrivate set is rejected with -EEXIST. On any failure
+ * the buffers mapped so far are unmapped again, last one first, and the
+ * error is returned. Returns 0 on success.
+ *
+ * Reads pool->pool_id and pool->bufs, so both must be valid on entry.
+ */
+static int io_zc_rx_map_pool(struct io_zc_rx_pool *pool,
+			     struct io_mapped_ubuf *imu,
+			     struct device *dev)
+{
+	int idx = 0;
+	int err;
+
+	while (idx < imu->nr_bvecs) {
+		struct page *pg = imu->bvec[idx].bv_page;
+
+		if (PagePrivate(pg)) {
+			err = -EEXIST;
+			goto unwind;
+		}
+
+		err = io_zc_rx_map_buf(dev, pg, pool->pool_id, idx,
+				       &pool->bufs[idx]);
+		if (err)
+			goto unwind;
+
+		pool->freelist[idx] = idx;
+		idx++;
+	}
+
+	return 0;
+
+unwind:
+	/* idx is the entry that failed; everything below it was mapped */
+	for (; idx > 0; idx--)
+		io_zc_rx_unmap_buf(dev, &pool->bufs[idx - 1]);
+
+	return err;
+}
+
+/*
+ * Create the ZC buffer pool for @ifq from registered buffer @id of @ctx.
+ *
+ * The registered buffer must start and end on a page boundary. One
+ * io_zc_rx_buf is set up per bvec of the buffer; all of them start out on
+ * the freelist and the cache starts empty.
+ *
+ * Returns 0 on success with ifq->pool set, -EFAULT if @ifq already has a
+ * pool, @id is out of range or the buffer is not page aligned, -ENOMEM on
+ * allocation failure, or the error from io_zc_rx_map_pool().
+ */
+static int io_zc_rx_create_pool(struct io_ring_ctx *ctx,
+				struct io_zc_rx_ifq *ifq,
+				u16 id)
+{
+	struct device *dev = netdev2dev(ifq->dev);
+	struct io_mapped_ubuf *imu;
+	struct io_zc_rx_pool *pool;
+	int nr_pages;
+	int ret;
+
+	if (ifq->pool)
+		return -EFAULT;
+
+	if (unlikely(id >= ctx->nr_user_bufs))
+		return -EFAULT;
+	/* @id is caller-supplied; clamp it so it cannot index out of bounds speculatively */
+	id = array_index_nospec(id, ctx->nr_user_bufs);
+	imu = ctx->user_bufs[id];
+	if (imu->ubuf & ~PAGE_MASK || imu->ubuf_end & ~PAGE_MASK)
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	nr_pages = imu->nr_bvecs;
+	pool = kvmalloc(struct_size(pool, freelist, nr_pages), GFP_KERNEL);
+	if (!pool)
+		goto err;
+
+	pool->bufs = kvmalloc_array(nr_pages, sizeof(*pool->bufs), GFP_KERNEL);
+	if (!pool->bufs)
+		goto err_buf;
+
+	/*
+	 * kvmalloc() does not zero the pool, and io_zc_rx_map_pool() stamps
+	 * pool->pool_id into every page's private data, so the identity
+	 * fields must be filled in before mapping rather than after it.
+	 */
+	pool->ifq = ifq;
+	pool->pool_id = id;
+	pool->nr_pages = nr_pages;
+
+	ret = io_zc_rx_map_pool(pool, imu, dev);
+	if (ret)
+		goto err_map;
+
+	pool->cache_count = 0;
+	spin_lock_init(&pool->freelist_lock);
+	pool->free_count = nr_pages;
+	/* publish only once the pool is fully set up */
+	ifq->pool = pool;
+
+	return 0;
+
+err_map:
+	kvfree(pool->bufs);
+err_buf:
+	kvfree(pool);
+err:
+	return ret;
+}
+
+/*
+ * Tear down @pool: unmap each of its nr_pages buffers on the ifq's device,
+ * then free the buffer array and the pool itself.
+ */
+static void io_zc_rx_destroy_pool(struct io_zc_rx_pool *pool)
+{
+	struct device *dev = netdev2dev(pool->ifq->dev);
+	int idx = 0;
+
+	while (idx < pool->nr_pages) {
+		io_zc_rx_unmap_buf(dev, &pool->bufs[idx]);
+		idx++;
+	}
+
+	kvfree(pool->bufs);
+	kvfree(pool);
+}
+
static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx)
{
struct io_zc_rx_ifq *ifq;
@@ -60,6 +227,8 @@ static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq)
{
if (ifq->if_rxq_id != -1)
io_close_zc_rxq(ifq);
+ if (ifq->pool)
+ io_zc_rx_destroy_pool(ifq->pool);
if (ifq->dev)
dev_put(ifq->dev);
io_free_rbuf_ring(ifq);
@@ -94,7 +263,9 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
if (!ifq->dev)
goto err;
- /* TODO: map zc region and initialise zc pool */
+ ret = io_zc_rx_create_pool(ctx, ifq, reg.region_id);
+ if (ret)
+ goto err;
ifq->rq_entries = reg.rq_entries;
ifq->cq_entries = reg.cq_entries;
--
2.39.3
next prev parent reply other threads:[~2023-11-07 21:41 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-11-07 21:40 [RFC PATCH v2 00/20] Zero copy Rx using io_uring David Wei
2023-11-07 21:40 ` [PATCH 01/20] io_uring: add interface queue David Wei
2023-11-07 21:40 ` [PATCH 02/20] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-11-07 21:40 ` [PATCH 03/20] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-11-07 21:40 ` [PATCH 04/20] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2023-11-07 21:40 ` [PATCH 05/20] io_uring/zcrx: implement socket registration David Wei
2023-11-07 21:40 ` David Wei [this message]
2023-11-07 21:40 ` [PATCH 07/20] io_uring: add ZC pool API David Wei
2023-11-07 21:40 ` [PATCH 08/20] skbuff: add SKBFL_FIXED_FRAG and skb_fixed() David Wei
2023-11-07 21:40 ` [PATCH 09/20] io_uring: allocate a uarg for freeing zero copy skbs David Wei
2023-11-07 21:40 ` [PATCH 10/20] io_uring: delay ZC pool destruction David Wei
2023-11-07 21:40 ` [PATCH 11/20] net: add data pool David Wei
2023-11-07 21:40 ` [PATCH 12/20] io_uring: add io_recvzc request David Wei
2023-11-07 21:40 ` [PATCH 13/20] io_uring/zcrx: propagate ifq down the stack David Wei
2023-11-07 21:40 ` [PATCH 14/20] io_uring/zcrx: introduce io_zc_get_rbuf_cqe David Wei
2023-11-07 21:40 ` [PATCH 15/20] io_uring/zcrx: add copy fallback David Wei
2023-11-07 21:40 ` [PATCH 16/20] net: execute custom callback from napi David Wei
2023-11-07 21:40 ` [PATCH 17/20] io_uring/zcrx: copy fallback to ring buffers David Wei
2023-11-07 21:40 ` [PATCH 18/20] veth: add support for io_uring zc rx David Wei
2023-11-07 21:40 ` [PATCH 19/20] bnxt: use data pool David Wei
2023-11-07 21:40 ` [PATCH 20/20] io_uring/zcrx: add multi socket support per Rx queue David Wei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox