From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>
Subject: [RFC PATCH v4 16/16] veth: add support for io_uring zc rx
Date: Tue, 12 Mar 2024 14:44:30 -0700 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: Pavel Begunkov <[email protected]>
Not for upstream, testing only
Add io_uring zerocopy support for veth. It's not truly zerocopy: the
data is still copied in napi, but that happens early in the stack, so
it is useful for testing for now.
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
drivers/net/veth.c | 214 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 208 insertions(+), 6 deletions(-)
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 500b9dfccd08..b56e06113453 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -26,6 +26,8 @@
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
+#include <linux/io_uring/net.h>
+#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#define DRV_NAME "veth"
@@ -67,6 +69,7 @@ struct veth_rq {
struct ptr_ring xdp_ring;
struct xdp_rxq_info xdp_rxq;
struct page_pool *page_pool;
+ struct netdev_rx_queue rq;
};
struct veth_priv {
@@ -75,6 +78,7 @@ struct veth_priv {
struct bpf_prog *_xdp_prog;
struct veth_rq *rq;
unsigned int requested_headroom;
+ bool zc_installed;
};
struct veth_xdp_tx_bq {
@@ -335,9 +339,12 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
const struct net_device *rcv,
const struct sk_buff *skb)
{
+ struct veth_priv *rcv_priv = netdev_priv(rcv);
+
return !(dev->features & NETIF_F_ALL_TSO) ||
(skb->destructor == sock_wfree &&
- rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
+ rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)) ||
+ rcv_priv->zc_installed;
}
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -726,6 +733,9 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
struct sk_buff *skb = *pskb;
u32 frame_sz;
+ if (WARN_ON_ONCE(1))
+ return -EFAULT;
+
if (skb_shared(skb) || skb_head_is_locked(skb) ||
skb_shinfo(skb)->nr_frags ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
@@ -758,6 +768,90 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
return -ENOMEM;
}
+/*
+ * veth_iou_rcv_skb - copy a received skb into io_uring zero-copy rx buffers.
+ * @rq:  receive queue; its page pool supplies the frag buffers
+ * @skb: packet to convert; ownership passes to this function
+ *
+ * Builds a new skb whose linear area is a copy of @skb's linear area and
+ * whose frags are netmems allocated from @rq->page_pool, filled with a copy
+ * of the remaining payload.
+ *
+ * Return: the new skb (with @skb consumed), @skb itself on the random
+ * testing-only fallback path, or NULL if the packet was dropped (@skb freed).
+ */
+static noinline struct sk_buff *veth_iou_rcv_skb(struct veth_rq *rq,
+						 struct sk_buff *skb)
+{
+	struct sk_buff *nskb;
+	u32 size, len, off, max_head_size;
+	int ret, i, head_off;
+
+	/* Testing only, randomly send normal pages to test copy fallback */
+	if (ktime_get_ns() % 16 == 0)
+		return skb;
+
+	skb_prepare_for_gro(skb);
+	max_head_size = skb_headlen(skb);
+
+	rcu_read_lock();
+	nskb = napi_alloc_skb(&rq->xdp_napi, max_head_size);
+	if (!nskb)
+		goto drop;
+
+	skb_copy_header(nskb, skb);
+	skb_mark_for_recycle(nskb);
+
+	/* Copy the linear part and fix up header offsets for the new head */
+	size = max_head_size;
+	if (skb_copy_bits(skb, 0, nskb->data, size))
+		goto drop_nskb;
+	skb_put(nskb, size);
+	head_off = skb_headroom(nskb) - skb_headroom(skb);
+	skb_headers_offset_update(nskb, head_off);
+
+	/* Allocate paged area of new skb */
+	off = size;
+	len = skb->len - off;
+
+	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+		struct io_zc_rx_buf *buf;
+		netmem_ref netmem;
+		void *vaddr;
+
+		netmem = page_pool_alloc_netmem(rq->page_pool, GFP_ATOMIC | __GFP_NOWARN);
+		if (!netmem)
+			goto drop_nskb;
+
+		/*
+		 * Attach the buffer to nskb before any further checks so that
+		 * every error path below hands it back by freeing nskb, the
+		 * same way the copy-failure path always did. Previously the
+		 * sanity-check failures leaked the netmem (and, for the pool
+		 * mismatch case, nskb as well).
+		 */
+		size = min_t(u32, len, PAGE_SIZE);
+		skb_add_rx_frag_netmem(nskb, i, netmem, 0, size, PAGE_SIZE);
+
+		if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+			goto drop_nskb;
+
+		buf = container_of(netmem_to_net_iov(netmem),
+				   struct io_zc_rx_buf, niov);
+		if (WARN_ON_ONCE(buf->niov.pp != rq->page_pool))
+			goto drop_nskb;
+
+		vaddr = kmap_atomic(buf->page);
+		ret = skb_copy_bits(skb, off, vaddr, size);
+		kunmap_atomic(vaddr);
+		if (ret)
+			goto drop_nskb;
+
+		len -= size;
+		off += size;
+	}
+
+	/* Ran out of frag slots: drop rather than pass on a truncated packet */
+	if (off < skb->len)
+		goto drop_nskb;
+
+	rcu_read_unlock();
+
+	consume_skb(skb);
+	return nskb;
+
+drop_nskb:
+	consume_skb(nskb);
+drop:
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NULL;
+}
+
+
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
struct sk_buff *skb,
struct veth_xdp_tx_bq *bq,
@@ -901,8 +995,13 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
/* ndo_start_xmit */
struct sk_buff *skb = ptr;
- stats->xdp_bytes += skb->len;
- skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
+ if (rq->page_pool->mp_ops == &io_uring_pp_zc_ops) {
+ skb = veth_iou_rcv_skb(rq, skb);
+ } else {
+ stats->xdp_bytes += skb->len;
+ skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
+ }
+
if (skb) {
if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
netif_receive_skb(skb);
@@ -961,15 +1060,22 @@ static int veth_poll(struct napi_struct *napi, int budget)
return done;
}
-static int veth_create_page_pool(struct veth_rq *rq)
+static int veth_create_page_pool(struct veth_rq *rq, struct io_zc_rx_ifq *ifq)
{
struct page_pool_params pp_params = {
.order = 0,
.pool_size = VETH_RING_SIZE,
.nid = NUMA_NO_NODE,
.dev = &rq->dev->dev,
+ .napi = &rq->xdp_napi,
};
+ if (ifq) {
+ rq->rq.pp_private = ifq;
+ rq->rq.pp_ops = &io_uring_pp_zc_ops;
+ pp_params.queue = &rq->rq;
+ }
+
rq->page_pool = page_pool_create(&pp_params);
if (IS_ERR(rq->page_pool)) {
int err = PTR_ERR(rq->page_pool);
@@ -987,7 +1093,7 @@ static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
int err, i;
for (i = start; i < end; i++) {
- err = veth_create_page_pool(&priv->rq[i]);
+ err = veth_create_page_pool(&priv->rq[i], NULL);
if (err)
goto err_page_pool;
}
@@ -1043,9 +1149,17 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end)
for (i = start; i < end; i++) {
struct veth_rq *rq = &priv->rq[i];
+ void *ptr;
+ int nr = 0;
rq->rx_notify_masked = false;
- ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
+
+ while ((ptr = ptr_ring_consume(&rq->xdp_ring))) {
+ veth_ptr_free(ptr);
+ nr++;
+ }
+
+ ptr_ring_cleanup(&rq->xdp_ring, NULL);
}
for (i = start; i < end; i++) {
@@ -1281,6 +1395,9 @@ static int veth_set_channels(struct net_device *dev,
struct net_device *peer;
int err;
+ if (priv->zc_installed)
+ return -EINVAL;
+
/* sanity check. Upper bounds are already enforced by the caller */
if (!ch->rx_count || !ch->tx_count)
return -EINVAL;
@@ -1358,6 +1475,8 @@ static int veth_open(struct net_device *dev)
struct net_device *peer = rtnl_dereference(priv->peer);
int err;
+ priv->zc_installed = false;
+
if (!peer)
return -ENOTCONN;
@@ -1536,6 +1655,84 @@ static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
rcu_read_unlock();
}
+/*
+ * __veth_iou_set - install or remove io_uring zero-copy rx on a veth queue.
+ * @dev: veth device; uses rtnl_dereference(), so the caller must hold RTNL
+ * @xdp: XDP_SETUP_ZC_RX command; a non-NULL zc_rx.ifq installs, NULL removes
+ *
+ * Only supported on a device that is up, has no XDP program attached and
+ * has exactly one real rx queue.
+ *
+ * Return: 0 on success, negative errno otherwise.
+ */
+static int __veth_iou_set(struct net_device *dev,
+			  struct netdev_bpf *xdp)
+{
+	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
+	unsigned qid = xdp->zc_rx.queue_id;
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer;
+	struct veth_rq *rq;
+	int ret;
+
+	if (priv->_xdp_prog)
+		return -EINVAL;
+	if (qid >= dev->real_num_rx_queues)
+		return -EINVAL;
+	if (!(dev->flags & IFF_UP))
+		return -EOPNOTSUPP;
+	if (dev->real_num_rx_queues != 1)
+		return -EINVAL;
+	rq = &priv->rq[qid];
+
+	/* Removal request */
+	if (!xdp->zc_rx.ifq) {
+		if (!priv->zc_installed)
+			return -EINVAL;
+
+		/*
+		 * NOTE(review): the peer's NETIF_F_GSO_SOFTWARE hw_features bits
+		 * cleared at install time are not restored here - confirm
+		 * whether that is intended.
+		 */
+		veth_napi_del(dev);
+		priv->zc_installed = false;
+		if (!veth_gro_requested(dev) && netif_running(dev)) {
+			dev->features &= ~NETIF_F_GRO;
+			netdev_features_change(dev);
+		}
+		return 0;
+	}
+
+	if (priv->zc_installed)
+		return -EINVAL;
+
+	/* Do not dereference a peer that has gone away */
+	peer = rtnl_dereference(priv->peer);
+	if (!peer)
+		return -ENOTCONN;
+
+	/*
+	 * NOTE(review): when napi_already_on is true the rq presumably already
+	 * owns a page pool (and ring) from the regular napi enable path, which
+	 * the two calls below would overwrite - confirm and reject or tear
+	 * down first if so.
+	 */
+	ret = veth_create_page_pool(rq, xdp->zc_rx.ifq);
+	if (ret)
+		return ret;
+
+	ret = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
+	if (ret) {
+		page_pool_destroy(rq->page_pool);
+		rq->page_pool = NULL;
+		return ret;
+	}
+
+	/*
+	 * Touch the peer's features only once nothing can fail any more, so
+	 * an aborted install leaves the peer unchanged.
+	 */
+	peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+	priv->zc_installed = true;
+
+	if (!veth_gro_requested(dev)) {
+		/* user-space did not request GRO, but zero-copy rx is
+		 * received through NAPI, so GRO is switched on with it
+		 */
+		dev->features |= NETIF_F_GRO;
+		netdev_features_change(dev);
+	}
+	if (!napi_already_on) {
+		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
+		napi_enable(&rq->xdp_napi);
+		rcu_assign_pointer(rq->napi, &rq->xdp_napi);
+	}
+	return 0;
+}
+
+/*
+ * Locked entry point for XDP_SETUP_ZC_RX: runs __veth_iou_set() with RTNL
+ * held and passes its result straight back to the caller.
+ */
+static int veth_iou_set(struct net_device *dev,
+			struct netdev_bpf *xdp)
+{
+	int err;
+
+	rtnl_lock();
+	err = __veth_iou_set(dev, xdp);
+	rtnl_unlock();
+
+	return err;
+}
+
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
@@ -1545,6 +1742,9 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
unsigned int max_mtu;
int err;
+ if (priv->zc_installed)
+ return -EINVAL;
+
old_prog = priv->_xdp_prog;
priv->_xdp_prog = prog;
peer = rtnl_dereference(priv->peer);
@@ -1623,6 +1823,8 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
switch (xdp->command) {
case XDP_SETUP_PROG:
return veth_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_SETUP_ZC_RX:
+ return veth_iou_set(dev, xdp);
default:
return -EINVAL;
}
--
2.43.0
prev parent reply other threads:[~2024-03-12 21:44 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-12 21:44 [RFC PATCH v4 00/16] Zero copy Rx using io_uring David Wei
2024-03-12 21:44 ` [RFC PATCH v4 01/16] net: generalise pp provider params passing David Wei
2024-03-12 21:44 ` [RFC PATCH v4 02/16] io_uring: delayed cqe commit David Wei
2024-03-12 21:44 ` [RFC PATCH v4 03/16] net: page_pool: add ->scrub mem provider callback David Wei
2024-03-12 21:44 ` [RFC PATCH v4 04/16] io_uring: separate header for exported net bits David Wei
2024-03-12 21:44 ` [RFC PATCH v4 05/16] io_uring: introduce interface queue David Wei
2024-03-12 21:44 ` [RFC PATCH v4 06/16] io_uring: add mmap support for shared ifq ringbuffers David Wei
2024-03-12 21:44 ` [RFC PATCH v4 07/16] netdev: add XDP_SETUP_ZC_RX command David Wei
2024-03-12 21:44 ` [RFC PATCH v4 08/16] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2024-03-12 21:44 ` [RFC PATCH v4 09/16] io_uring/zcrx: implement socket registration David Wei
2024-03-12 21:44 ` [RFC PATCH v4 10/16] io_uring: add zero copy buf representation and pool David Wei
2024-03-12 21:44 ` [RFC PATCH v4 11/16] io_uring: implement pp memory provider for zc rx David Wei
2024-03-12 21:44 ` [RFC PATCH v4 12/16] io_uring/zcrx: implement PP_FLAG_DMA_* handling David Wei
2024-03-12 21:44 ` [RFC PATCH v4 13/16] io_uring: add io_recvzc request David Wei
2024-03-13 20:25 ` Jens Axboe
2024-03-13 20:26 ` Pavel Begunkov
2024-03-13 21:03 ` Jens Axboe
2024-03-14 16:14 ` Jens Axboe
2024-03-15 17:34 ` Pavel Begunkov
2024-03-15 18:38 ` Jens Axboe
2024-03-15 23:52 ` Pavel Begunkov
2024-03-16 16:59 ` Jens Axboe
2024-03-17 21:22 ` Pavel Begunkov
2024-03-17 21:30 ` Jens Axboe
2024-03-12 21:44 ` [RFC PATCH v4 14/16] net: execute custom callback from napi David Wei
2024-03-12 21:44 ` [RFC PATCH v4 15/16] io_uring/zcrx: add copy fallback David Wei
2024-03-12 21:44 ` David Wei [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox