From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>,
Willem de Bruijn <[email protected]>,
Dragos Tatulea <[email protected]>
Subject: [PATCH 18/20] veth: add support for io_uring zc rx
Date: Tue, 7 Nov 2023 13:40:43 -0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: Pavel Begunkov <[email protected]>
Dirty and hacky, testing only
Add io_uring zerocopy support for veth. It's not actually zerocopy: we
copy the data in NAPI context, which is early enough in the stack to be
useful for testing.
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
drivers/net/veth.c | 179 ++++++++++++++++++++++++++++++++++++++++++++-
io_uring/zc_rx.c | 15 ++--
2 files changed, 186 insertions(+), 8 deletions(-)
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 0deefd1573cf..08420d43ac00 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -27,6 +27,8 @@
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
#include <net/page_pool/helpers.h>
+#include <net/netdev_rx_queue.h>
+#include <net/data_pool.h>
#define DRV_NAME "veth"
#define DRV_VERSION "1.0"
@@ -67,6 +69,8 @@ struct veth_rq {
struct ptr_ring xdp_ring;
struct xdp_rxq_info xdp_rxq;
struct page_pool *page_pool;
+
+ struct data_pool zc_dp;
};
struct veth_priv {
@@ -75,6 +79,7 @@ struct veth_priv {
struct bpf_prog *_xdp_prog;
struct veth_rq *rq;
unsigned int requested_headroom;
+ bool zc_installed;
};
struct veth_xdp_tx_bq {
@@ -335,9 +340,12 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
const struct net_device *rcv,
const struct sk_buff *skb)
{
+ struct veth_priv *rcv_priv = netdev_priv(rcv);
+
return !(dev->features & NETIF_F_ALL_TSO) ||
(skb->destructor == sock_wfree &&
- rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
+ rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)) ||
+ rcv_priv->zc_installed;
}
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -828,6 +836,73 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
return -ENOMEM;
}
+/*
+ * veth_iou_rcv_skb - copy a received skb into io_uring zc rx buffers
+ * @rq:  receive queue whose zc data pool (rq->zc_dp) supplies the pages
+ * @skb: incoming skb; ownership is taken, it is released on every path
+ *
+ * Builds a new skb: the linear part of @skb is copied into the head of an
+ * skb obtained from napi_alloc_skb(), the remaining bytes are copied into
+ * pages taken from data_pool_alloc_page(), one frag per page.
+ *
+ * Returns the new skb, or NULL if an allocation or copy failed or the
+ * payload does not fit into MAX_SKB_FRAGS pages. In the NULL case both the
+ * partially built skb and @skb have been freed.
+ */
+static struct sk_buff *veth_iou_rcv_skb(struct veth_rq *rq,
+					struct sk_buff *skb)
+{
+	struct sk_buff *nskb;
+	u32 size, len, off, head_len;
+	struct page *page;
+	int ret, i, head_off;
+	void *vaddr;
+
+	skb_prepare_for_gro(skb);
+	head_len = skb_headlen(skb);
+
+	rcu_read_lock();
+	nskb = napi_alloc_skb(&rq->xdp_napi, head_len);
+	if (!nskb)
+		goto drop;
+
+	skb_zcopy_init(nskb, rq->zc_dp.zc_uarg);
+	skb_copy_header(nskb, skb);
+	skb_mark_for_recycle(nskb);
+
+	/* Linear part goes into the head of the new skb. */
+	if (skb_copy_bits(skb, 0, nskb->data, head_len))
+		goto drop_nskb;
+	skb_put(nskb, head_len);
+
+	/* Header offsets were copied verbatim; rebase them on the new head. */
+	head_off = skb_headroom(nskb) - skb_headroom(skb);
+	skb_headers_offset_update(nskb, head_off);
+
+	/* Everything past the linear part is copied page by page. */
+	off = head_len;
+	len = skb->len - off;
+
+	for (i = 0; i < MAX_SKB_FRAGS && len; i++) {
+		page = data_pool_alloc_page(&rq->zc_dp);
+		if (!page)
+			goto drop_nskb;
+
+		/* Attach first so the page is released together with nskb. */
+		size = min_t(u32, len, PAGE_SIZE);
+		skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
+
+		vaddr = kmap_local_page(page);
+		ret = skb_copy_bits(skb, off, vaddr, size);
+		kunmap_local(vaddr);
+
+		if (ret)
+			goto drop_nskb;
+
+		len -= size;
+		off += size;
+	}
+
+	/*
+	 * Ran out of frag slots with payload still pending: drop the packet
+	 * instead of handing a silently truncated skb up the stack.
+	 */
+	if (len)
+		goto drop_nskb;
+
+	rcu_read_unlock();
+
+	consume_skb(skb);
+	return nskb;
+
+drop_nskb:
+	kfree_skb(nskb);
+drop:
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NULL;
+}
+
+
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
struct sk_buff *skb,
struct veth_xdp_tx_bq *bq,
@@ -971,8 +1046,13 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
/* ndo_start_xmit */
struct sk_buff *skb = ptr;
- stats->xdp_bytes += skb->len;
- skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
+ if (!rq->zc_dp.zc_ifq) {
+ stats->xdp_bytes += skb->len;
+ skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
+ } else {
+ skb = veth_iou_rcv_skb(rq, skb);
+ }
+
if (skb) {
if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
netif_receive_skb(skb);
@@ -1351,6 +1431,9 @@ static int veth_set_channels(struct net_device *dev,
struct net_device *peer;
int err;
+ if (priv->zc_installed)
+ return -EINVAL;
+
/* sanity check. Upper bounds are already enforced by the caller */
if (!ch->rx_count || !ch->tx_count)
return -EINVAL;
@@ -1428,6 +1511,8 @@ static int veth_open(struct net_device *dev)
struct net_device *peer = rtnl_dereference(priv->peer);
int err;
+ priv->zc_installed = false;
+
if (!peer)
return -ENOTCONN;
@@ -1618,6 +1703,89 @@ static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
rcu_read_unlock();
}
+/*
+ * __veth_iou_set - install or remove an io_uring zc rx ifq on a veth rx queue
+ * @dev: veth device
+ * @xdp: request; xdp->zc_rx.queue_id selects the queue, a non-NULL
+ *       xdp->zc_rx.ifq installs and a NULL one removes
+ *
+ * Refused with -EINVAL when an XDP program is attached, the queue id is out
+ * of range, the device has more than one rx queue, or the queue is already
+ * in the requested state. Refused with -EOPNOTSUPP when the device is down
+ * and with -ENOTCONN when the peer is gone. Uses rtnl_dereference(), so the
+ * caller must hold rtnl_lock.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __veth_iou_set(struct net_device *dev,
+			  struct netdev_bpf *xdp)
+{
+	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
+	unsigned int qid = xdp->zc_rx.queue_id;
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer;
+	struct veth_rq *rq;
+	int ret;
+
+	if (priv->_xdp_prog)
+		return -EINVAL;
+	if (qid >= dev->real_num_rx_queues)
+		return -EINVAL;
+	if (!(dev->flags & IFF_UP))
+		return -EOPNOTSUPP;
+	if (dev->real_num_rx_queues != 1)
+		return -EINVAL;
+
+	rq = &priv->rq[qid];
+	/* Installing twice, or removing what is not installed, is an error. */
+	if (!!rq->zc_dp.zc_ifq == !!xdp->zc_rx.ifq)
+		return -EINVAL;
+
+	if (rq->zc_dp.zc_ifq) {
+		/*
+		 * NOTE(review): NAPI is deleted here even if it was already
+		 * running before the install (GRO requested) - confirm that
+		 * this is intended.
+		 */
+		veth_napi_del(dev);
+		rq->zc_dp.zc_ifq = NULL;
+		rq->zc_dp.page_pool = NULL;
+		rq->zc_dp.zc_uarg = NULL;
+		priv->zc_installed = false;
+
+		if (!veth_gro_requested(dev) && netif_running(dev)) {
+			dev->features &= ~NETIF_F_GRO;
+			netdev_features_change(dev);
+		}
+		return 0;
+	}
+
+	/* The peer may already be gone; veth_open() reports this the same way. */
+	peer = rtnl_dereference(priv->peer);
+	if (!peer)
+		return -ENOTCONN;
+
+	/*
+	 * NOTE(review): when napi_already_on is true the ring and page pool
+	 * were presumably set up by the regular NAPI enable path already;
+	 * verify that re-initialising them here does not leak the old ones.
+	 */
+	ret = veth_create_page_pool(rq);
+	if (ret)
+		return ret;
+
+	ret = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
+	if (ret) {
+		page_pool_destroy(rq->page_pool);
+		rq->page_pool = NULL;
+		return ret;
+	}
+
+	/*
+	 * Touch the peer only once nothing can fail any more, so an error
+	 * above leaves its features untouched.
+	 */
+	peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+
+	rq->zc_dp.zc_ifq = xdp->zc_rx.ifq;
+	rq->zc_dp.zc_uarg = xdp->zc_rx.uarg;
+	rq->zc_dp.page_pool = rq->page_pool;
+	priv->zc_installed = true;
+
+	if (!veth_gro_requested(dev)) {
+		/* user-space did not request GRO, but installing zc rx
+		 * turns it on for as long as the ifq stays installed
+		 */
+		dev->features |= NETIF_F_GRO;
+		netdev_features_change(dev);
+	}
+	if (!napi_already_on) {
+		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
+		napi_enable(&rq->xdp_napi);
+		rcu_assign_pointer(rq->napi, &rq->xdp_napi);
+	}
+	io_zc_rx_set_napi(rq->zc_dp.zc_ifq, rq->xdp_napi.napi_id);
+	return 0;
+}
+
+/*
+ * veth_iou_set - rtnl-locked entry point for XDP_SETUP_ZC_RX
+ *
+ * Takes rtnl_lock around __veth_iou_set() and passes its result through.
+ */
+static int veth_iou_set(struct net_device *dev,
+			struct netdev_bpf *xdp)
+{
+	int err;
+
+	rtnl_lock();
+	err = __veth_iou_set(dev, xdp);
+	rtnl_unlock();
+
+	return err;
+}
+
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
@@ -1627,6 +1795,9 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
unsigned int max_mtu;
int err;
+ if (priv->zc_installed)
+ return -EINVAL;
+
old_prog = priv->_xdp_prog;
priv->_xdp_prog = prog;
peer = rtnl_dereference(priv->peer);
@@ -1705,6 +1876,8 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
switch (xdp->command) {
case XDP_SETUP_PROG:
return veth_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_SETUP_ZC_RX:
+ return veth_iou_set(dev, xdp);
default:
return -EINVAL;
}
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 14328024a550..611a068c3402 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -122,11 +122,14 @@ static void io_zc_rx_skb_free(struct sk_buff *skb, struct ubuf_info *uarg,
static int io_zc_rx_map_buf(struct device *dev, struct page *page, u16 pool_id,
u32 pgid, struct io_zc_rx_buf *buf)
{
- dma_addr_t addr;
+ dma_addr_t addr = 0;
SetPagePrivate(page);
set_page_private(page, mk_page_info(pool_id, pgid));
+ if (!dev)
+ goto out;
+
addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
DMA_BIDIRECTIONAL,
DMA_ATTR_SKIP_CPU_SYNC);
@@ -135,7 +138,7 @@ static int io_zc_rx_map_buf(struct device *dev, struct page *page, u16 pool_id,
ClearPagePrivate(page);
return -ENOMEM;
}
-
+out:
buf->dma = addr;
buf->page = page;
atomic_set(&buf->refcount, 0);
@@ -151,9 +154,11 @@ static void io_zc_rx_unmap_buf(struct device *dev, struct io_zc_rx_buf *buf)
page = buf->page;
set_page_private(page, 0);
ClearPagePrivate(page);
- dma_unmap_page_attrs(dev, buf->dma, PAGE_SIZE,
- DMA_BIDIRECTIONAL,
- DMA_ATTR_SKIP_CPU_SYNC);
+
+ if (dev)
+ dma_unmap_page_attrs(dev, buf->dma, PAGE_SIZE,
+ DMA_BIDIRECTIONAL,
+ DMA_ATTR_SKIP_CPU_SYNC);
put_page(page);
}
--
2.39.3
next prev parent reply other threads:[~2023-11-07 21:41 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-11-07 21:40 [RFC PATCH v2 00/20] Zero copy Rx using io_uring David Wei
2023-11-07 21:40 ` [PATCH 01/20] io_uring: add interface queue David Wei
2023-11-07 21:40 ` [PATCH 02/20] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-11-07 21:40 ` [PATCH 03/20] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-11-07 21:40 ` [PATCH 04/20] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2023-11-07 21:40 ` [PATCH 05/20] io_uring/zcrx: implement socket registration David Wei
2023-11-07 21:40 ` [PATCH 06/20] io_uring: add ZC buf and pool David Wei
2023-11-07 21:40 ` [PATCH 07/20] io_uring: add ZC pool API David Wei
2023-11-07 21:40 ` [PATCH 08/20] skbuff: add SKBFL_FIXED_FRAG and skb_fixed() David Wei
2023-11-07 21:40 ` [PATCH 09/20] io_uring: allocate a uarg for freeing zero copy skbs David Wei
2023-11-07 21:40 ` [PATCH 10/20] io_uring: delay ZC pool destruction David Wei
2023-11-07 21:40 ` [PATCH 11/20] net: add data pool David Wei
2023-11-07 21:40 ` [PATCH 12/20] io_uring: add io_recvzc request David Wei
2023-11-07 21:40 ` [PATCH 13/20] io_uring/zcrx: propagate ifq down the stack David Wei
2023-11-07 21:40 ` [PATCH 14/20] io_uring/zcrx: introduce io_zc_get_rbuf_cqe David Wei
2023-11-07 21:40 ` [PATCH 15/20] io_uring/zcrx: add copy fallback David Wei
2023-11-07 21:40 ` [PATCH 16/20] net: execute custom callback from napi David Wei
2023-11-07 21:40 ` [PATCH 17/20] io_uring/zcrx: copy fallback to ring buffers David Wei
2023-11-07 21:40 ` David Wei [this message]
2023-11-07 21:40 ` [PATCH 19/20] bnxt: use data pool David Wei
2023-11-07 21:40 ` [PATCH 20/20] io_uring/zcrx: add multi socket support per Rx queue David Wei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox