From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
	Pavel Begunkov <[email protected]>,
	Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
	"David S. Miller" <[email protected]>,
	Eric Dumazet <[email protected]>,
	Jesper Dangaard Brouer <[email protected]>,
	David Ahern <[email protected]>,
	Mina Almasry <[email protected]>
Subject: [RFC PATCH v3 17/20] io_uring/zcrx: add copy fallback
Date: Tue, 19 Dec 2023 13:03:54 -0800
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>

From: Pavel Begunkov <[email protected]>

Currently, if the user fails to keep up with the network and doesn't
refill the buffer ring fast enough, the NIC/driver will start dropping
packets. That might be too punishing. Add a fallback path, which allows
drivers to allocate normal pages when there is starvation;
zc_rx_recv_skb() will then detect them and copy the data into the
user-specified buffers once those become available.

That should help with adoption, and should also help the user strike
the right balance: allocating just the right amount of zerocopy buffers
while staying resilient to sudden surges in traffic.
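
From the application's point of view the fallback is transparent:
copied data completes through the same rbuf CQEs as zerocopy data, with
cqe->off pointing into the registered buffer area. A rough consumer
sketch, assuming `area' is the buffer area mmap()ed at ifq registration
and using a simplified CQE layout (the exact UAPI struct is defined
earlier in this series):

    #include <stddef.h>
    #include <linux/types.h>

    /* Sketch only: the field layout mirrors the cqe fields filled in
     * by this patch (off/len/region/sock/flags), not the exact UAPI.
     */
    struct io_uring_rbuf_cqe {
            __u32 off;
            __u32 len;
            __u16 region;
            __u8  sock;
            __u8  flags;
    };

    /* app-defined payload handler */
    extern void process_data(unsigned sock, const void *data, size_t len);

    static void drain_rbuf_cq(struct io_uring_rbuf_cqe *cq, unsigned mask,
                              unsigned *head, unsigned tail,
                              const char *area)
    {
            while (*head != tail) {
                    struct io_uring_rbuf_cqe *cqe = &cq[*head & mask];

                    /* the payload always lives at area + cqe->off,
                     * whether the NIC wrote it there directly or the
                     * kernel copied it in via this fallback
                     */
                    process_data(cqe->sock, area + cqe->off, cqe->len);
                    (*head)++;
            }
    }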

Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
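A note on the refill path: page_pool allocation is only safe from the
NAPI context the pool is driven from, which is why
io_zc_get_buf_task_safe() bounces the allocation through the
napi_execute() helper from patch 16/20. The pattern, reduced to its
shape (a sketch of what the code below does, not a separate API):

    /* io_napi_refill() runs under the pool's NAPI context, where
     * page_pool_dev_alloc_pages() may be called safely.
     */
    struct io_zc_refill_data rd = { .ifq = ifq };

    napi_execute(ifq->pp->p.napi, io_napi_refill, &rd);
    /* rd.buf is now a freshly allocated buffer, or NULL on failure */
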
 io_uring/zc_rx.c | 126 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 120 insertions(+), 6 deletions(-)

diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index acb70ca23150..f7d99d569885 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -6,6 +6,7 @@
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
 #include <linux/nospec.h>
+#include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <trace/events/page_pool.h>
 
@@ -21,6 +22,11 @@ struct io_zc_rx_args {
 	struct socket		*sock;
 };
 
+struct io_zc_refill_data {
+	struct io_zc_rx_ifq *ifq;
+	struct io_zc_rx_buf *buf;
+};
+
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 
 static inline u32 io_zc_rx_cqring_entries(struct io_zc_rx_ifq *ifq)
@@ -603,6 +609,39 @@ const struct pp_memory_provider_ops io_uring_pp_zc_ops = {
 };
 EXPORT_SYMBOL(io_uring_pp_zc_ops);
 
+static void io_napi_refill(void *data)
+{
+	struct io_zc_refill_data *rd = data;
+	struct io_zc_rx_ifq *ifq = rd->ifq;
+	void *page;
+
+	if (WARN_ON_ONCE(!ifq->pp))
+		return;
+
+	page = page_pool_dev_alloc_pages(ifq->pp);
+	if (!page)
+		return;
+	if (WARN_ON_ONCE(!page_is_page_pool_iov(page)))
+		return;
+
+	rd->buf = io_iov_to_buf(page_to_page_pool_iov(page));
+}
+
+static struct io_zc_rx_buf *io_zc_get_buf_task_safe(struct io_zc_rx_ifq *ifq)
+{
+	struct io_zc_refill_data rd = {
+		.ifq = ifq,
+	};
+
+	napi_execute(ifq->pp->p.napi, io_napi_refill, &rd);
+	return rd.buf;
+}
+
+static inline void io_zc_return_rbuf_cqe(struct io_zc_rx_ifq *ifq)
+{
+	ifq->cached_cq_tail--;
+}
+
 static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *ifq)
 {
 	struct io_uring_rbuf_cqe *cqe;
@@ -622,6 +661,51 @@ static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *
 	return cqe;
 }
 
+static ssize_t zc_rx_copy_chunk(struct io_zc_rx_ifq *ifq, void *data,
+				unsigned int offset, size_t len,
+				unsigned sock_idx)
+{
+	size_t copy_size, copied = 0;
+	struct io_uring_rbuf_cqe *cqe;
+	struct io_zc_rx_buf *buf;
+	int ret = 0, off = 0;
+	u8 *vaddr;
+
+	do {
+		cqe = io_zc_get_rbuf_cqe(ifq);
+		if (!cqe) {
+			ret = -ENOBUFS;
+			break;
+		}
+		buf = io_zc_get_buf_task_safe(ifq);
+		if (!buf) {
+			io_zc_return_rbuf_cqe(ifq);
+			ret = -ENOMEM;
+			break;
+		}
+
+		vaddr = kmap_local_page(buf->page);
+		copy_size = min_t(size_t, PAGE_SIZE, len);
+		memcpy(vaddr, data + offset, copy_size);
+		kunmap_local(vaddr);
+
+		cqe->region = 0;
+		cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
+		cqe->len = copy_size;
+		cqe->flags = 0;
+		cqe->sock = sock_idx;
+
+		io_zc_rx_get_buf_uref(buf);
+		page_pool_iov_put_many(&buf->ppiov, 1);
+
+		offset += copy_size;
+		len -= copy_size;
+		copied += copy_size;
+	} while (len);
+
+	return copied ? copied : ret;
+}
+
 static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
 			   int off, int len, unsigned sock_idx)
 {
@@ -650,7 +734,22 @@ static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
 		cqe->sock = sock_idx;
 		cqe->flags = 0;
 	} else {
-		return -EOPNOTSUPP;
+		struct page *page = skb_frag_page(frag);
+		u32 p_off, p_len, t, copied = 0;
+		u8 *vaddr;
+		int ret = 0;
+
+		skb_frag_foreach_page(frag, off, len,
+				      page, p_off, p_len, t) {
+			vaddr = kmap_local_page(page);
+			ret = zc_rx_copy_chunk(ifq, vaddr, p_off, p_len, sock_idx);
+			kunmap_local(vaddr);
+
+			if (ret < 0)
+				return copied ? copied : ret;
+			copied += ret;
+		}
+		len = copied;
 	}
 
 	return len;
@@ -665,15 +764,30 @@ zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 	struct socket *sock = args->sock;
 	unsigned sock_idx = sock->zc_rx_idx & IO_ZC_IFQ_IDX_MASK;
 	struct sk_buff *frag_iter;
-	unsigned start, start_off;
+	unsigned start, start_off = offset;
 	int i, copy, end, off;
 	int ret = 0;
 
-	start = skb_headlen(skb);
-	start_off = offset;
+	if (unlikely(offset < skb_headlen(skb))) {
+		ssize_t copied;
+		size_t to_copy;
 
-	if (offset < start)
-		return -EOPNOTSUPP;
+		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
+		copied = zc_rx_copy_chunk(ifq, skb->data, offset, to_copy,
+					  sock_idx);
+		if (copied < 0) {
+			ret = copied;
+			goto out;
+		}
+		offset += copied;
+		len -= copied;
+		if (!len)
+			goto out;
+		if (offset != skb_headlen(skb))
+			goto out;
+	}
+
+	start = skb_headlen(skb);
 
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		const skb_frag_t *frag;
-- 
2.39.3


Thread overview: 50+ messages
2023-12-19 21:03 [RFC PATCH v3 00/20] Zero copy Rx using io_uring David Wei
2023-12-19 21:03 ` [RFC PATCH v3 01/20] net: page_pool: add ppiov mangling helper David Wei
2023-12-19 23:22   ` Mina Almasry
2023-12-19 23:59     ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 02/20] tcp: don't allow non-devmem originated ppiov David Wei
2023-12-19 23:24   ` Mina Almasry
2023-12-20  1:29     ` Pavel Begunkov
2024-01-02 16:11       ` Mina Almasry
2023-12-19 21:03 ` [RFC PATCH v3 03/20] net: page pool: rework ppiov life cycle David Wei
2023-12-19 23:35   ` Mina Almasry
2023-12-20  0:49     ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 04/20] net: enable napi_pp_put_page for ppiov David Wei
2023-12-19 21:03 ` [RFC PATCH v3 05/20] net: page_pool: add ->scrub mem provider callback David Wei
2023-12-19 21:03 ` [RFC PATCH v3 06/20] io_uring: separate header for exported net bits David Wei
2023-12-20 16:01   ` Jens Axboe
2023-12-19 21:03 ` [RFC PATCH v3 07/20] io_uring: add interface queue David Wei
2023-12-20 16:13   ` Jens Axboe
2023-12-20 16:23     ` Pavel Begunkov
2023-12-21  1:44     ` David Wei
2023-12-21 17:57   ` Willem de Bruijn
2023-12-30 16:25     ` Pavel Begunkov
2023-12-31 22:25       ` Willem de Bruijn
2023-12-19 21:03 ` [RFC PATCH v3 08/20] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-12-20 16:13   ` Jens Axboe
2023-12-19 21:03 ` [RFC PATCH v3 09/20] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-12-19 21:03 ` [RFC PATCH v3 10/20] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2023-12-20 16:06   ` Jens Axboe
2023-12-20 16:24     ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 11/20] io_uring/zcrx: implement socket registration David Wei
2023-12-19 21:03 ` [RFC PATCH v3 12/20] io_uring: add ZC buf and pool David Wei
2023-12-19 21:03 ` [RFC PATCH v3 13/20] io_uring: implement pp memory provider for zc rx David Wei
2023-12-19 23:44   ` Mina Almasry
2023-12-20  0:39     ` Pavel Begunkov
2023-12-21 19:36   ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 14/20] net: page pool: add io_uring memory provider David Wei
2023-12-19 23:39   ` Mina Almasry
2023-12-20  0:04     ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 15/20] io_uring: add io_recvzc request David Wei
2023-12-20 16:27   ` Jens Axboe
2023-12-20 17:04     ` Pavel Begunkov
2023-12-20 18:09       ` Jens Axboe
2023-12-21 18:59         ` Pavel Begunkov
2023-12-21 21:32           ` Jens Axboe
2023-12-30 21:15             ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 16/20] net: execute custom callback from napi David Wei
2023-12-19 21:03 ` David Wei [this message]
2023-12-19 21:03 ` [RFC PATCH v3 18/20] veth: add support for io_uring zc rx David Wei
2023-12-19 21:03 ` [RFC PATCH v3 19/20] net: page pool: generalise ppiov dma address get David Wei
2023-12-21 19:51   ` Mina Almasry
2023-12-19 21:03 ` [RFC PATCH v3 20/20] bnxt: enable io_uring zc page pool David Wei
