From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
	Pavel Begunkov <[email protected]>,
	Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
	"David S. Miller" <[email protected]>,
	Eric Dumazet <[email protected]>,
	Jesper Dangaard Brouer <[email protected]>,
	David Ahern <[email protected]>,
	Mina Almasry <[email protected]>
Subject: [RFC PATCH v4 16/16] veth: add support for io_uring zc rx
Date: Tue, 12 Mar 2024 14:44:30 -0700
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>

From: Pavel Begunkov <[email protected]>

Not for upstream, testing only

Add io_uring zerocopy Rx support to veth. It is not true zerocopy: the
data is still copied, in napi context, into the io_uring-provided
buffers. But the copy happens early in the stack, which is good enough
for testing the rest of the series for now.
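
For reference, the heart of that copy is sketched below: a condensed
version of the per-frag loop in veth_iou_rcv_skb() from this patch.
The helper name is hypothetical, made up for illustration; the types
and calls (io_zc_rx_buf, page_pool_alloc_netmem(),
skb_add_rx_frag_netmem()) are the ones this series adds.

	/* Copy one chunk of @skb at @off into netmem taken from the
	 * queue's page pool and attach it to @nskb as frag @i.
	 * Returns the number of bytes copied or a negative error.
	 */
	static int veth_zc_copy_chunk(struct veth_rq *rq,
				      struct sk_buff *nskb,
				      struct sk_buff *skb,
				      int i, u32 off, u32 len)
	{
		u32 size = min_t(u32, len, PAGE_SIZE);
		struct io_zc_rx_buf *buf;
		netmem_ref netmem;
		void *vaddr;
		int ret;

		netmem = page_pool_alloc_netmem(rq->page_pool,
						GFP_ATOMIC | __GFP_NOWARN);
		if (!netmem)
			return -ENOMEM;

		buf = container_of(netmem_to_net_iov(netmem),
				   struct io_zc_rx_buf, niov);
		/* nskb owns the frag from here on; freeing nskb
		 * returns the netmem to the pool
		 */
		skb_add_rx_frag_netmem(nskb, i, netmem, 0, size, PAGE_SIZE);

		vaddr = kmap_atomic(buf->page);
		ret = skb_copy_bits(skb, off, vaddr, size);
		kunmap_atomic(vaddr);
		return ret ? ret : size;
	}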

Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
 drivers/net/veth.c | 219 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 213 insertions(+), 6 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 500b9dfccd08..b56e06113453 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -26,6 +26,8 @@
 #include <linux/ptr_ring.h>
 #include <linux/bpf_trace.h>
 #include <linux/net_tstamp.h>
+#include <linux/io_uring/net.h>
+#include <net/netdev_rx_queue.h>
 #include <net/page_pool/helpers.h>
 
 #define DRV_NAME	"veth"
@@ -67,6 +69,7 @@ struct veth_rq {
 	struct ptr_ring		xdp_ring;
 	struct xdp_rxq_info	xdp_rxq;
 	struct page_pool	*page_pool;
+	struct netdev_rx_queue	rq;
 };
 
 struct veth_priv {
@@ -75,6 +78,7 @@ struct veth_priv {
 	struct bpf_prog		*_xdp_prog;
 	struct veth_rq		*rq;
 	unsigned int		requested_headroom;
+	bool			zc_installed;
 };
 
 struct veth_xdp_tx_bq {
@@ -335,9 +339,12 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
 					 const struct net_device *rcv,
 					 const struct sk_buff *skb)
 {
+	struct veth_priv *rcv_priv = netdev_priv(rcv);
+
 	return !(dev->features & NETIF_F_ALL_TSO) ||
 		(skb->destructor == sock_wfree &&
-		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
+		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)) ||
+		rcv_priv->zc_installed;
 }
 
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -726,6 +733,9 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 	struct sk_buff *skb = *pskb;
 	u32 frame_sz;
 
+	if (WARN_ON_ONCE(1))
+		return -EFAULT;
+
 	if (skb_shared(skb) || skb_head_is_locked(skb) ||
 	    skb_shinfo(skb)->nr_frags ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
@@ -758,6 +768,92 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 	return -ENOMEM;
 }
 
+static noinline struct sk_buff *veth_iou_rcv_skb(struct veth_rq *rq,
+					struct sk_buff *skb)
+{
+	struct sk_buff *nskb;
+	u32 size, len, off, max_head_size;
+	struct page *page;
+	int ret, i, head_off;
+	void *vaddr;
+
+	/* Testing only, randomly send normal pages to test copy fallback */
+	if (ktime_get_ns() % 16 == 0)
+		return skb;
+
+	skb_prepare_for_gro(skb);
+	max_head_size = skb_headlen(skb);
+
+	rcu_read_lock();
+	nskb = napi_alloc_skb(&rq->xdp_napi, max_head_size);
+	if (!nskb)
+		goto drop;
+
+	skb_copy_header(nskb, skb);
+	skb_mark_for_recycle(nskb);
+
+	size = max_head_size;
+	if (skb_copy_bits(skb, 0, nskb->data, size)) {
+		consume_skb(nskb);
+		goto drop;
+	}
+	skb_put(nskb, size);
+	head_off = skb_headroom(nskb) - skb_headroom(skb);
+	skb_headers_offset_update(nskb, head_off);
+
+	/* Allocate paged area of new skb */
+	off = size;
+	len = skb->len - off;
+
+	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+		struct io_zc_rx_buf *buf;
+		netmem_ref netmem;
+
+		netmem = page_pool_alloc_netmem(rq->page_pool, GFP_ATOMIC | __GFP_NOWARN);
+		if (!netmem) {
+			consume_skb(nskb);
+			goto drop;
+		}
+		if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) {
+			consume_skb(nskb);
+			goto drop;
+		}
+
+		buf = container_of(netmem_to_net_iov(netmem),
+				   struct io_zc_rx_buf, niov);
+		page = buf->page;
+
+		if (WARN_ON_ONCE(buf->niov.pp != rq->page_pool)) {
+			consume_skb(nskb);
+			goto drop;
+		}
+
+		size = min_t(u32, len, PAGE_SIZE);
+		skb_add_rx_frag_netmem(nskb, i, netmem, 0, size, PAGE_SIZE);
+
+		vaddr = kmap_atomic(page);
+		ret = skb_copy_bits(skb, off, vaddr, size);
+		kunmap_atomic(vaddr);
+
+		if (ret) {
+			consume_skb(nskb);
+			goto drop;
+		}
+		len -= size;
+		off += size;
+	}
+	rcu_read_unlock();
+
+	consume_skb(skb);
+	skb = nskb;
+	return skb;
+drop:
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NULL;
+}
+
+
 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 					struct sk_buff *skb,
 					struct veth_xdp_tx_bq *bq,
@@ -901,8 +997,13 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
 			/* ndo_start_xmit */
 			struct sk_buff *skb = ptr;
 
-			stats->xdp_bytes += skb->len;
-			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
+			if (rq->page_pool->mp_ops == &io_uring_pp_zc_ops) {
+				skb = veth_iou_rcv_skb(rq, skb);
+			} else {
+				stats->xdp_bytes += skb->len;
+				skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
+			}
+
 			if (skb) {
 				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
 					netif_receive_skb(skb);
@@ -961,15 +1062,22 @@ static int veth_poll(struct napi_struct *napi, int budget)
 	return done;
 }
 
-static int veth_create_page_pool(struct veth_rq *rq)
+static int veth_create_page_pool(struct veth_rq *rq, struct io_zc_rx_ifq *ifq)
 {
 	struct page_pool_params pp_params = {
 		.order = 0,
 		.pool_size = VETH_RING_SIZE,
 		.nid = NUMA_NO_NODE,
 		.dev = &rq->dev->dev,
+		.napi = &rq->xdp_napi,
 	};
 
+	if (ifq) {
+		rq->rq.pp_private = ifq;
+		rq->rq.pp_ops = &io_uring_pp_zc_ops;
+		pp_params.queue = &rq->rq;
+	}
+
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
 		int err = PTR_ERR(rq->page_pool);
@@ -987,7 +1095,7 @@ static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
 	int err, i;
 
 	for (i = start; i < end; i++) {
-		err = veth_create_page_pool(&priv->rq[i]);
+		err = veth_create_page_pool(&priv->rq[i], NULL);
 		if (err)
 			goto err_page_pool;
 	}
@@ -1043,9 +1151,17 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end)
 
 	for (i = start; i < end; i++) {
 		struct veth_rq *rq = &priv->rq[i];
+		void *ptr;
+		int nr = 0;
 
 		rq->rx_notify_masked = false;
-		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
+
+		while ((ptr = ptr_ring_consume(&rq->xdp_ring))) {
+			veth_ptr_free(ptr);
+			nr++;
+		}
+
+		ptr_ring_cleanup(&rq->xdp_ring, NULL);
 	}
 
 	for (i = start; i < end; i++) {
@@ -1281,6 +1397,9 @@ static int veth_set_channels(struct net_device *dev,
 	struct net_device *peer;
 	int err;
 
+	if (priv->zc_installed)
+		return -EINVAL;
+
 	/* sanity check. Upper bounds are already enforced by the caller */
 	if (!ch->rx_count || !ch->tx_count)
 		return -EINVAL;
@@ -1358,6 +1477,8 @@ static int veth_open(struct net_device *dev)
 	struct net_device *peer = rtnl_dereference(priv->peer);
 	int err;
 
+	priv->zc_installed = false;
+
 	if (!peer)
 		return -ENOTCONN;
 
@@ -1536,6 +1657,87 @@ static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
 	rcu_read_unlock();
 }
 
+static int __veth_iou_set(struct net_device *dev,
+			  struct netdev_bpf *xdp)
+{
+	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
+	unsigned int qid = xdp->zc_rx.queue_id;
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer;
+	struct veth_rq *rq;
+	int ret;
+
+	if (priv->_xdp_prog)
+		return -EINVAL;
+	if (qid >= dev->real_num_rx_queues)
+		return -EINVAL;
+	if (!(dev->flags & IFF_UP))
+		return -EOPNOTSUPP;
+	if (dev->real_num_rx_queues != 1)
+		return -EINVAL;
+	rq = &priv->rq[qid];
+
+	if (!xdp->zc_rx.ifq) {
+		if (!priv->zc_installed)
+			return -EINVAL;
+
+		veth_napi_del(dev);
+		priv->zc_installed = false;
+		if (!veth_gro_requested(dev) && netif_running(dev)) {
+			dev->features &= ~NETIF_F_GRO;
+			netdev_features_change(dev);
+		}
+		return 0;
+	}
+
+	if (priv->zc_installed)
+		return -EINVAL;
+
+	peer = rtnl_dereference(priv->peer);
+	if (!peer)
+		return -ENOTCONN;
+
+	peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+
+	ret = veth_create_page_pool(rq, xdp->zc_rx.ifq);
+	if (ret)
+		return ret;
+
+	ret = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
+	if (ret) {
+		page_pool_destroy(rq->page_pool);
+		rq->page_pool = NULL;
+		return ret;
+	}
+
+	priv->zc_installed = true;
+
+	if (!veth_gro_requested(dev)) {
+		/* user-space did not require GRO, but adding XDP
+		 * is supposed to get GRO working
+		 */
+		dev->features |= NETIF_F_GRO;
+		netdev_features_change(dev);
+	}
+	if (!napi_already_on) {
+		netif_napi_add(dev, &rq->xdp_napi, veth_poll);
+		napi_enable(&rq->xdp_napi);
+		rcu_assign_pointer(rq->napi, &rq->xdp_napi);
+	}
+	return 0;
+}
+
+static int veth_iou_set(struct net_device *dev,
+			struct netdev_bpf *xdp)
+{
+	int ret;
+
+	rtnl_lock();
+	ret = __veth_iou_set(dev, xdp);
+	rtnl_unlock();
+	return ret;
+}
+
 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 			struct netlink_ext_ack *extack)
 {
@@ -1545,6 +1747,9 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	unsigned int max_mtu;
 	int err;
 
+	if (priv->zc_installed)
+		return -EINVAL;
+
 	old_prog = priv->_xdp_prog;
 	priv->_xdp_prog = prog;
 	peer = rtnl_dereference(priv->peer);
@@ -1623,6 +1828,8 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
 		return veth_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_SETUP_ZC_RX:
+		return veth_iou_set(dev, xdp);
 	default:
 		return -EINVAL;
 	}
-- 
2.43.0

