From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>,
Stanislav Fomichev <[email protected]>,
Joe Damato <[email protected]>,
Pedro Tammela <[email protected]>
Subject: [PATCH v7 14/15] io_uring/zcrx: add copy fallback
Date: Tue, 29 Oct 2024 16:05:17 -0700 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: Pavel Begunkov <[email protected]>
There are scenarios in which the zerocopy path might get a normal
in-kernel buffer, it could be a mis-steered packet or simply the linear
part of an skb. Another use case is to allow the driver to allocate
kernel pages when it's out of zc buffers, which makes it more resilient
to spikes in load and allow the user to choose the balance between the
amount of memory provided and performance.
At the moment we fail such requests. Instead, grab a buffer from the
page pool, copy data there, and return back to user in the usual way.
Because the refill ring is private to the napi our page pool is running
from, it's done by stopping the napi via napi_execute() helper. It grabs
only one buffer, which is inefficient, and improving it is left for
follow up patches.
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
io_uring/zcrx.c | 133 +++++++++++++++++++++++++++++++++++++++++++++---
io_uring/zcrx.h | 1 +
2 files changed, 127 insertions(+), 7 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 3f4625730dbd..1f4db70e3370 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -5,6 +5,8 @@
#include <linux/nospec.h>
#include <linux/netdevice.h>
#include <linux/io_uring.h>
+#include <linux/skbuff_ref.h>
+#include <net/busy_poll.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <trace/events/page_pool.h>
@@ -28,6 +30,11 @@ struct io_zcrx_args {
struct socket *sock;
};
+struct io_zc_refill_data {
+ struct io_zcrx_ifq *ifq;
+ struct net_iov *niov;
+};
+
static const struct memory_provider_ops io_uring_pp_zc_ops;
static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
@@ -37,6 +44,13 @@ static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *nio
return container_of(owner, struct io_zcrx_area, nia);
}
+static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ return area->pages[net_iov_idx(niov)];
+}
+
static int io_open_zc_rxq(struct io_zcrx_ifq *ifq, unsigned ifq_idx)
{
struct netdev_rx_queue *rxq;
@@ -59,6 +73,13 @@ static int io_open_zc_rxq(struct io_zcrx_ifq *ifq, unsigned ifq_idx)
ret = netdev_rx_queue_restart(ifq->dev, ifq->if_rxq);
if (ret)
goto fail;
+
+ if (WARN_ON_ONCE(!ifq->pp)) {
+ ret = -EFAULT;
+ goto fail;
+ }
+ /* grab napi_id while still under rtnl */
+ ifq->napi_id = ifq->pp->p.napi->napi_id;
return 0;
fail:
rxq->mp_params.mp_ops = NULL;
@@ -526,6 +547,7 @@ static void io_pp_zc_destroy(struct page_pool *pp)
page_pool_mp_release_area(pp, &ifq->area->nia);
ifq->pp = NULL;
+ ifq->napi_id = 0;
if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
return;
@@ -540,6 +562,34 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
.scrub = io_pp_zc_scrub,
};
+static void io_napi_refill(void *data)
+{
+ struct io_zc_refill_data *rd = data;
+ struct io_zcrx_ifq *ifq = rd->ifq;
+ netmem_ref netmem;
+
+ if (WARN_ON_ONCE(!ifq->pp))
+ return;
+
+ netmem = page_pool_alloc_netmem(ifq->pp, GFP_ATOMIC | __GFP_NOWARN);
+ if (!netmem)
+ return;
+ if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ return;
+
+ rd->niov = netmem_to_net_iov(netmem);
+}
+
+static struct net_iov *io_zc_get_buf_task_safe(struct io_zcrx_ifq *ifq)
+{
+ struct io_zc_refill_data rd = {
+ .ifq = ifq,
+ };
+
+ napi_execute(ifq->napi_id, io_napi_refill, &rd);
+ return rd.niov;
+}
+
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
struct io_zcrx_ifq *ifq, int off, int len)
{
@@ -563,6 +613,45 @@ static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
return true;
}
+static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
+ void *data, unsigned int offset, size_t len)
+{
+ size_t copy_size, copied = 0;
+ int ret = 0, off = 0;
+ struct page *page;
+ u8 *vaddr;
+
+ do {
+ struct net_iov *niov;
+
+ niov = io_zc_get_buf_task_safe(ifq);
+ if (!niov) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ page = io_zcrx_iov_page(niov);
+ vaddr = kmap_local_page(page);
+ copy_size = min_t(size_t, PAGE_SIZE, len);
+ memcpy(vaddr, data + offset, copy_size);
+ kunmap_local(vaddr);
+
+ if (!io_zcrx_queue_cqe(req, niov, ifq, off, copy_size)) {
+ napi_pp_put_page(net_iov_to_netmem(niov));
+ return -ENOSPC;
+ }
+
+ io_zcrx_get_buf_uref(niov);
+ napi_pp_put_page(net_iov_to_netmem(niov));
+
+ offset += copy_size;
+ len -= copy_size;
+ copied += copy_size;
+ } while (offset < len);
+
+ return copied ? copied : ret;
+}
+
static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
const skb_frag_t *frag, int off, int len)
{
@@ -570,8 +659,24 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
off += skb_frag_off(frag);
- if (unlikely(!skb_frag_is_net_iov(frag)))
- return -EOPNOTSUPP;
+ if (unlikely(!skb_frag_is_net_iov(frag))) {
+ struct page *page = skb_frag_page(frag);
+ u32 p_off, p_len, t, copied = 0;
+ u8 *vaddr;
+ int ret = 0;
+
+ skb_frag_foreach_page(frag, off, len,
+ page, p_off, p_len, t) {
+ vaddr = kmap_local_page(page);
+ ret = io_zcrx_copy_chunk(req, ifq, vaddr, p_off, p_len);
+ kunmap_local(vaddr);
+
+ if (ret < 0)
+ return copied ? copied : ret;
+ copied += ret;
+ }
+ return copied;
+ }
niov = netmem_to_net_iov(frag->netmem);
if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
@@ -592,15 +697,29 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
struct io_zcrx_ifq *ifq = args->ifq;
struct io_kiocb *req = args->req;
struct sk_buff *frag_iter;
- unsigned start, start_off;
+ unsigned start, start_off = offset;
int i, copy, end, off;
int ret = 0;
- start = skb_headlen(skb);
- start_off = offset;
+ if (unlikely(offset < skb_headlen(skb))) {
+ ssize_t copied;
+ size_t to_copy;
- if (offset < start)
- return -EOPNOTSUPP;
+ to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
+ copied = io_zcrx_copy_chunk(req, ifq, skb->data, offset, to_copy);
+ if (copied < 0) {
+ ret = copied;
+ goto out;
+ }
+ offset += copied;
+ len -= copied;
+ if (!len)
+ goto out;
+ if (offset != skb_headlen(skb))
+ goto out;
+ }
+
+ start = skb_headlen(skb);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
const skb_frag_t *frag;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index d3f6b6cdd647..5d7920972e95 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -39,6 +39,7 @@ struct io_zcrx_ifq {
u32 if_rxq;
netdevice_tracker netdev_tracker;
+ unsigned napi_id;
};
#if defined(CONFIG_IO_URING_ZCRX)
--
2.43.5
next prev parent reply other threads:[~2024-10-29 23:06 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-29 23:05 [PATCH v7 00/15] io_uring zero copy rx David Wei
2024-10-29 23:05 ` [PATCH v7 01/15] net: prefix devmem specific helpers David Wei
2024-10-29 23:05 ` [PATCH v7 02/15] net: generalise net_iov chunk owners David Wei
2024-10-29 23:05 ` [PATCH v7 03/15] net: page_pool: create hooks for custom page providers David Wei
2024-10-29 23:05 ` [PATCH v7 04/15] net: prepare for non devmem TCP memory providers David Wei
2024-10-29 23:05 ` [PATCH v7 05/15] net: page_pool: add ->scrub mem provider callback David Wei
2024-10-29 23:05 ` [PATCH v7 06/15] net: page pool: add helper creating area from pages David Wei
2024-10-29 23:05 ` [PATCH v7 07/15] net: page_pool: introduce page_pool_mp_return_in_cache David Wei
2024-10-29 23:05 ` [PATCH v7 08/15] net: add helper executing custom callback from napi David Wei
2024-10-29 23:05 ` [PATCH v7 09/15] io_uring/zcrx: add interface queue and refill queue David Wei
2024-10-29 23:05 ` [PATCH v7 10/15] io_uring/zcrx: add io_zcrx_area David Wei
2024-10-29 23:05 ` [PATCH v7 11/15] io_uring/zcrx: implement zerocopy receive pp memory provider David Wei
2024-10-29 23:05 ` [PATCH v7 12/15] io_uring/zcrx: add io_recvzc request David Wei
2024-10-29 23:05 ` [PATCH v7 13/15] io_uring/zcrx: set pp memory provider for an rx queue David Wei
2024-10-29 23:05 ` David Wei [this message]
2024-10-29 23:05 ` [PATCH v7 15/15] io_uring/zcrx: throttle receive requests David Wei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox