From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
	Pavel Begunkov <[email protected]>,
	Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
	"David S. Miller" <[email protected]>,
	Eric Dumazet <[email protected]>,
	Jesper Dangaard Brouer <[email protected]>,
	David Ahern <[email protected]>,
	Mina Almasry <[email protected]>
Subject: [RFC PATCH v3 18/20] veth: add support for io_uring zc rx
Date: Tue, 19 Dec 2023 13:03:55 -0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

From: Pavel Begunkov <[email protected]>

NOT FOR UPSTREAM, TESTING ONLY.

Add io_uring zero copy rx support for veth. It's not actually zero copy:
the data is copied in napi context, which is still early enough in the
stack to be useful for testing.

Note: we'll need some virtual device support for testing, but it should
not get in the way of real workloads.

Signed-off-by: David Wei <[email protected]>
---
 drivers/net/veth.c | 211 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 205 insertions(+), 6 deletions(-)
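
For reviewers skimming the diff: the core of the fallback data path
(veth_iou_rcv_skb() below) condenses to roughly the sketch that follows.
This is only an outline of what the patch does, with the GRO prep, the
linear-area copy and most of the error handling elided; the names match
this patch and earlier patches in the series.

	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
		struct io_zc_rx_buf *buf;
		struct page *ppage, *page;
		u32 size = min_t(u32, len, PAGE_SIZE);
		void *vaddr;

		/* pool uses PP_MP_IOU_ZCRX, so it hands out ppiov pages */
		ppage = page_pool_dev_alloc_pages(rq->page_pool);
		if (!ppage)
			break;	/* the real code frees nskb and drops */

		/* resolve the backing page of the ppiov before mapping it */
		buf = container_of(page_to_page_pool_iov(ppage),
				   struct io_zc_rx_buf, ppiov);
		page = buf->page;

		/* the frag keeps the ppiov page handed out by the pool */
		skb_add_rx_frag(nskb, i, ppage, 0, size, PAGE_SIZE);

		vaddr = kmap_atomic(page);
		skb_copy_bits(skb, off, vaddr, size);
		kunmap_atomic(vaddr);

		len -= size;
		off += size;
	}

The 1-in-16 pass-through of the original skb at the top of
veth_iou_rcv_skb() is there purely to exercise the copy fallback added
in the previous patch.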

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 57efb3454c57..dd00e172979f 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -26,6 +26,7 @@
 #include <linux/ptr_ring.h>
 #include <linux/bpf_trace.h>
 #include <linux/net_tstamp.h>
+#include <linux/io_uring/net.h>
 #include <net/page_pool/helpers.h>
 
 #define DRV_NAME	"veth"
@@ -75,6 +76,7 @@ struct veth_priv {
 	struct bpf_prog		*_xdp_prog;
 	struct veth_rq		*rq;
 	unsigned int		requested_headroom;
+	bool			zc_installed;
 };
 
 struct veth_xdp_tx_bq {
@@ -335,9 +337,12 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
 					 const struct net_device *rcv,
 					 const struct sk_buff *skb)
 {
+	struct veth_priv *rcv_priv = netdev_priv(rcv);
+
 	return !(dev->features & NETIF_F_ALL_TSO) ||
 		(skb->destructor == sock_wfree &&
-		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
+		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)) ||
+		rcv_priv->zc_installed;
 }
 
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -726,6 +731,9 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 	struct sk_buff *skb = *pskb;
 	u32 frame_sz;
 
+	if (WARN_ON_ONCE(1))
+		return -EFAULT;
+
 	if (skb_shared(skb) || skb_head_is_locked(skb) ||
 	    skb_shinfo(skb)->nr_frags ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
@@ -827,6 +835,90 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 	return -ENOMEM;
 }
 
+static noinline struct sk_buff *veth_iou_rcv_skb(struct veth_rq *rq,
+					struct sk_buff *skb)
+{
+	struct sk_buff *nskb;
+	u32 size, len, off, max_head_size;
+	struct page *page;
+	int ret, i, head_off;
+	void *vaddr;
+
+	/* Testing only, randomly send normal pages to test copy fallback */
+	if (ktime_get_ns() % 16 == 0)
+		return skb;
+
+	skb_prepare_for_gro(skb);
+	max_head_size = skb_headlen(skb);
+
+	rcu_read_lock();
+	nskb = napi_alloc_skb(&rq->xdp_napi, max_head_size);
+	if (!nskb)
+		goto drop;
+
+	skb_copy_header(nskb, skb);
+	skb_mark_for_recycle(nskb);
+
+	size = max_head_size;
+	if (skb_copy_bits(skb, 0, nskb->data, size)) {
+		consume_skb(nskb);
+		goto drop;
+	}
+	skb_put(nskb, size);
+	head_off = skb_headroom(nskb) - skb_headroom(skb);
+	skb_headers_offset_update(nskb, head_off);
+
+	/* Allocate paged area of new skb */
+	off = size;
+	len = skb->len - off;
+
+	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+		struct io_zc_rx_buf *buf;
+		struct page *ppage;
+
+		ppage = page_pool_dev_alloc_pages(rq->page_pool);
+		if (!ppage) {
+			consume_skb(nskb);
+			goto drop;
+		}
+		if (WARN_ON_ONCE(!page_is_page_pool_iov(ppage))) {
+			consume_skb(nskb);
+			goto drop;
+		}
+
+		buf = container_of(page_to_page_pool_iov(ppage),
+				   struct io_zc_rx_buf, ppiov);
+		page = buf->page;
+
+		if (WARN_ON_ONCE(buf->ppiov.pp != rq->page_pool))
+			goto drop;
+
+		size = min_t(u32, len, PAGE_SIZE);
+		skb_add_rx_frag(nskb, i, ppage, 0, size, PAGE_SIZE);
+
+		vaddr = kmap_atomic(page);
+		ret = skb_copy_bits(skb, off, vaddr, size);
+		kunmap_atomic(vaddr);
+
+		if (ret) {
+			consume_skb(nskb);
+			goto drop;
+		}
+		len -= size;
+		off += size;
+	}
+	rcu_read_unlock();
+
+	consume_skb(skb);
+	skb = nskb;
+	return skb;
+drop:
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NULL;
+}
+
+
 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 					struct sk_buff *skb,
 					struct veth_xdp_tx_bq *bq,
@@ -970,8 +1062,13 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
 			/* ndo_start_xmit */
 			struct sk_buff *skb = ptr;
 
-			stats->xdp_bytes += skb->len;
-			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
+			if (rq->page_pool->p.memory_provider == PP_MP_IOU_ZCRX) {
+				skb = veth_iou_rcv_skb(rq, skb);
+			} else {
+				stats->xdp_bytes += skb->len;
+				skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
+			}
+
 			if (skb) {
 				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
 					netif_receive_skb(skb);
@@ -1030,15 +1127,21 @@ static int veth_poll(struct napi_struct *napi, int budget)
 	return done;
 }
 
-static int veth_create_page_pool(struct veth_rq *rq)
+static int veth_create_page_pool(struct veth_rq *rq, struct io_zc_rx_ifq *ifq)
 {
 	struct page_pool_params pp_params = {
 		.order = 0,
 		.pool_size = VETH_RING_SIZE,
 		.nid = NUMA_NO_NODE,
 		.dev = &rq->dev->dev,
+		.napi = &rq->xdp_napi,
 	};
 
+	if (ifq) {
+		pp_params.mp_priv = ifq;
+		pp_params.memory_provider = PP_MP_IOU_ZCRX;
+	}
+
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
 		int err = PTR_ERR(rq->page_pool);
@@ -1056,7 +1159,7 @@ static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
 	int err, i;
 
 	for (i = start; i < end; i++) {
-		err = veth_create_page_pool(&priv->rq[i]);
+		err = veth_create_page_pool(&priv->rq[i], NULL);
 		if (err)
 			goto err_page_pool;
 	}
@@ -1112,9 +1215,17 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end)
 
 	for (i = start; i < end; i++) {
 		struct veth_rq *rq = &priv->rq[i];
+		void *ptr;
+		int nr = 0;
 
 		rq->rx_notify_masked = false;
-		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
+
+		while ((ptr = ptr_ring_consume(&rq->xdp_ring))) {
+			veth_ptr_free(ptr);
+			nr++;
+		}
+
+		ptr_ring_cleanup(&rq->xdp_ring, NULL);
 	}
 
 	for (i = start; i < end; i++) {
@@ -1350,6 +1461,9 @@ static int veth_set_channels(struct net_device *dev,
 	struct net_device *peer;
 	int err;
 
+	if (priv->zc_installed)
+		return -EINVAL;
+
 	/* sanity check. Upper bounds are already enforced by the caller */
 	if (!ch->rx_count || !ch->tx_count)
 		return -EINVAL;
@@ -1427,6 +1541,8 @@ static int veth_open(struct net_device *dev)
 	struct net_device *peer = rtnl_dereference(priv->peer);
 	int err;
 
+	priv->zc_installed = false;
+
 	if (!peer)
 		return -ENOTCONN;
 
@@ -1604,6 +1720,84 @@ static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
 	rcu_read_unlock();
 }
 
+static int __veth_iou_set(struct net_device *dev,
+			  struct netdev_bpf *xdp)
+{
+	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
+	unsigned int qid = xdp->zc_rx.queue_id;
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer;
+	struct veth_rq *rq;
+	int ret;
+
+	if (priv->_xdp_prog)
+		return -EINVAL;
+	if (qid >= dev->real_num_rx_queues)
+		return -EINVAL;
+	if (!(dev->flags & IFF_UP))
+		return -EOPNOTSUPP;
+	if (dev->real_num_rx_queues != 1)
+		return -EINVAL;
+	rq = &priv->rq[qid];
+
+	if (!xdp->zc_rx.ifq) {
+		if (!priv->zc_installed)
+			return -EINVAL;
+
+		veth_napi_del(dev);
+		priv->zc_installed = false;
+		if (!veth_gro_requested(dev) && netif_running(dev)) {
+			dev->features &= ~NETIF_F_GRO;
+			netdev_features_change(dev);
+		}
+		return 0;
+	}
+
+	if (priv->zc_installed)
+		return -EINVAL;
+
+	peer = rtnl_dereference(priv->peer);
+	peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+
+	ret = veth_create_page_pool(rq, xdp->zc_rx.ifq);
+	if (ret)
+		return ret;
+
+	ret = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
+	if (ret) {
+		page_pool_destroy(rq->page_pool);
+		rq->page_pool = NULL;
+		return ret;
+	}
+
+	priv->zc_installed = true;
+
+	if (!veth_gro_requested(dev)) {
+		/* user-space did not require GRO, but adding XDP
+		 * is supposed to get GRO working
+		 */
+		dev->features |= NETIF_F_GRO;
+		netdev_features_change(dev);
+	}
+	if (!napi_already_on) {
+		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
+		napi_enable(&rq->xdp_napi);
+		rcu_assign_pointer(rq->napi, &rq->xdp_napi);
+	}
+	return 0;
+}
+
+static int veth_iou_set(struct net_device *dev,
+			struct netdev_bpf *xdp)
+{
+	int ret;
+
+	rtnl_lock();
+	ret = __veth_iou_set(dev, xdp);
+	rtnl_unlock();
+	return ret;
+}
+
 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 			struct netlink_ext_ack *extack)
 {
@@ -1613,6 +1807,9 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	unsigned int max_mtu;
 	int err;
 
+	if (priv->zc_installed)
+		return -EINVAL;
+
 	old_prog = priv->_xdp_prog;
 	priv->_xdp_prog = prog;
 	peer = rtnl_dereference(priv->peer);
@@ -1691,6 +1888,8 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return veth_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_ZC_RX:
+		return veth_iou_set(dev, xdp);
 	default:
 		return -EINVAL;
 	}
-- 
2.39.3


