From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>,
Stanislav Fomichev <[email protected]>,
Joe Damato <[email protected]>,
Pedro Tammela <[email protected]>
Subject: [PATCH net-next v11 14/21] io_uring/zcrx: implement zerocopy receive pp memory provider
Date: Thu, 16 Jan 2025 15:16:56 -0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: Pavel Begunkov <[email protected]>
Implement a page pool memory provider for io_uring to receieve in a
zero copy fashion. For that, the provider allocates user pages wrapped
around into struct net_iovs, that are stored in a previously registered
struct net_iov_area.
Unlike the traditional receive, that frees pages and returns them back
to the page pool right after data was copied to the user, e.g. inside
recv(2), we extend the lifetime until the user space confirms that it's
done processing the data. That's done by taking a net_iov reference.
When the user is done with the buffer, it must return it back to the
kernel by posting an entry into the refill ring, which is usually polled
off the io_uring memory provider callback in the page pool's netmem
allocation path.
There is also a separate set of per net_iov "user" references accounting
whether a buffer is currently given to the user (including possible
fragmentation).
Reviewed-by: Jens Axboe <[email protected]>
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
io_uring/zcrx.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++
io_uring/zcrx.h | 3 +
2 files changed, 272 insertions(+)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 435cd634f91c..c007ff80b325 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -2,10 +2,17 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
+#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/netlink.h>
+
+#include <trace/events/page_pool.h>
+
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
@@ -16,6 +23,33 @@
#define IO_RQ_MAX_ENTRIES 32768
+__maybe_unused
+static const struct memory_provider_ops io_uring_pp_zc_ops;
+
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
+
+ return container_of(owner, struct io_zcrx_area, nia);
+}
+
+static inline atomic_t *io_get_user_counter(struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ return &area->user_refs[net_iov_idx(niov)];
+}
+
+static bool io_zcrx_put_niov_uref(struct net_iov *niov)
+{
+ atomic_t *uref = io_get_user_counter(niov);
+
+ if (unlikely(!atomic_read(uref)))
+ return false;
+ atomic_dec(uref);
+ return true;
+}
+
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
struct io_uring_region_desc *rd)
@@ -51,6 +85,7 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
{
kvfree(area->freelist);
kvfree(area->nia.niovs);
+ kvfree(area->user_refs);
if (area->pages) {
unpin_user_pages(area->pages, area->nia.num_niovs);
kvfree(area->pages);
@@ -106,6 +141,19 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
for (i = 0; i < nr_pages; i++)
area->freelist[i] = i;
+ area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!area->user_refs)
+ goto err;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct net_iov *niov = &area->nia.niovs[i];
+
+ niov->owner = &area->nia;
+ area->freelist[i] = i;
+ atomic_set(&area->user_refs[i], 0);
+ }
+
area->free_count = nr_pages;
area->ifq = ifq;
/* we're only supporting one area per ifq for now */
@@ -131,6 +179,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
ifq->if_rxq = -1;
ifq->ctx = ctx;
spin_lock_init(&ifq->lock);
+ spin_lock_init(&ifq->rq_lock);
return ifq;
}
@@ -256,7 +305,227 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
io_zcrx_ifq_free(ifq);
}
+static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
+{
+ unsigned niov_idx;
+
+ lockdep_assert_held(&area->freelist_lock);
+
+ niov_idx = area->freelist[--area->free_count];
+ return &area->nia.niovs[niov_idx];
+}
+
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ spin_lock_bh(&area->freelist_lock);
+ area->freelist[area->free_count++] = net_iov_idx(niov);
+ spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+ struct io_zcrx_area *area = ifq->area;
+ int i;
+
+ if (!area)
+ return;
+
+ /* Reclaim back all buffers given to the user space. */
+ for (i = 0; i < area->nia.num_niovs; i++) {
+ struct net_iov *niov = &area->nia.niovs[i];
+ int nr;
+
+ if (!atomic_read(io_get_user_counter(niov)))
+ continue;
+ nr = atomic_xchg(io_get_user_counter(niov), 0);
+ if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+ io_zcrx_return_niov(niov);
+ }
+}
+
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
lockdep_assert_held(&ctx->uring_lock);
+
+ if (ctx->ifq)
+ io_zcrx_scrub(ctx->ifq);
+}
+
+static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
+{
+ u32 entries;
+
+ entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+ return min(entries, ifq->rq_entries);
}
+
+static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
+ unsigned mask)
+{
+ unsigned int idx = ifq->cached_rq_head++ & mask;
+
+ return &ifq->rqes[idx];
+}
+
+static void io_zcrx_ring_refill(struct page_pool *pp,
+ struct io_zcrx_ifq *ifq)
+{
+ unsigned int mask = ifq->rq_entries - 1;
+ unsigned int entries;
+ netmem_ref netmem;
+
+ spin_lock_bh(&ifq->rq_lock);
+
+ entries = io_zcrx_rqring_entries(ifq);
+ entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
+ if (unlikely(!entries)) {
+ spin_unlock_bh(&ifq->rq_lock);
+ return;
+ }
+
+ do {
+ struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
+ struct io_zcrx_area *area;
+ struct net_iov *niov;
+ unsigned niov_idx, area_idx;
+
+ area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
+ niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;
+
+ if (unlikely(rqe->__pad || area_idx))
+ continue;
+ area = ifq->area;
+
+ if (unlikely(niov_idx >= area->nia.num_niovs))
+ continue;
+ niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
+
+ niov = &area->nia.niovs[niov_idx];
+ if (!io_zcrx_put_niov_uref(niov))
+ continue;
+
+ netmem = net_iov_to_netmem(niov);
+ if (page_pool_unref_netmem(netmem, 1) != 0)
+ continue;
+
+ if (unlikely(niov->pp != pp)) {
+ io_zcrx_return_niov(niov);
+ continue;
+ }
+
+ net_mp_netmem_place_in_cache(pp, netmem);
+ } while (--entries);
+
+ smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+ spin_unlock_bh(&ifq->rq_lock);
+}
+
+static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
+{
+ struct io_zcrx_area *area = ifq->area;
+
+ spin_lock_bh(&area->freelist_lock);
+ while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+ struct net_iov *niov = __io_zcrx_get_free_niov(area);
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ net_mp_niov_set_page_pool(pp, niov);
+ net_mp_netmem_place_in_cache(pp, netmem);
+ }
+ spin_unlock_bh(&area->freelist_lock);
+}
+
+static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
+{
+ struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+ /* pp should already be ensuring that */
+ if (unlikely(pp->alloc.count))
+ goto out_return;
+
+ io_zcrx_ring_refill(pp, ifq);
+ if (likely(pp->alloc.count))
+ goto out_return;
+
+ io_zcrx_refill_slow(pp, ifq);
+ if (!pp->alloc.count)
+ return 0;
+out_return:
+ return pp->alloc.cache[--pp->alloc.count];
+}
+
+static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ return false;
+
+ niov = netmem_to_net_iov(netmem);
+ net_mp_niov_clear_page_pool(niov);
+ io_zcrx_return_niov_freelist(niov);
+ return false;
+}
+
+static int io_pp_zc_init(struct page_pool *pp)
+{
+ struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+ if (WARN_ON_ONCE(!ifq))
+ return -EINVAL;
+ if (pp->dma_map)
+ return -EOPNOTSUPP;
+ if (pp->p.order != 0)
+ return -EOPNOTSUPP;
+
+ percpu_ref_get(&ifq->ctx->refs);
+ return 0;
+}
+
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+ struct io_zcrx_ifq *ifq = pp->mp_priv;
+ struct io_zcrx_area *area = ifq->area;
+
+ if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
+ return;
+ percpu_ref_put(&ifq->ctx->refs);
+}
+
+static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
+ struct netdev_rx_queue *rxq)
+{
+ struct nlattr *nest;
+ int type;
+
+ type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
+ nest = nla_nest_start(rsp, type);
+ nla_nest_end(rsp, nest);
+
+ return 0;
+}
+
+static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
+{
+ struct io_zcrx_ifq *ifq = mp_priv;
+
+ io_zcrx_drop_netdev(ifq);
+}
+
+static const struct memory_provider_ops io_uring_pp_zc_ops = {
+ .alloc_netmems = io_pp_zc_alloc_netmems,
+ .release_netmem = io_pp_zc_release_netmem,
+ .init = io_pp_zc_init,
+ .destroy = io_pp_zc_destroy,
+ .nl_fill = io_pp_nl_fill,
+ .uninstall = io_pp_uninstall,
+};
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 595bca0001d2..6c808240ac91 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -9,6 +9,7 @@
struct io_zcrx_area {
struct net_iov_area nia;
struct io_zcrx_ifq *ifq;
+ atomic_t *user_refs;
u16 area_id;
struct page **pages;
@@ -26,6 +27,8 @@ struct io_zcrx_ifq {
struct io_uring *rq_ring;
struct io_uring_zcrx_rqe *rqes;
u32 rq_entries;
+ u32 cached_rq_head;
+ spinlock_t rq_lock;
u32 if_rxq;
struct device *dev;
--
2.43.5
next prev parent reply other threads:[~2025-01-16 23:17 UTC|newest]
Thread overview: 43+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-16 23:16 [PATCH net-next v11 00/21] io_uring zero copy rx David Wei
2025-01-16 23:16 ` [PATCH net-next v11 01/21] net: page_pool: don't cast mp param to devmem David Wei
2025-01-16 23:16 ` [PATCH net-next v11 02/21] net: prefix devmem specific helpers David Wei
2025-01-16 23:16 ` [PATCH net-next v11 03/21] net: generalise net_iov chunk owners David Wei
2025-01-16 23:16 ` [PATCH net-next v11 04/21] net: page_pool: create hooks for custom memory providers David Wei
2025-01-17 1:46 ` Jakub Kicinski
2025-01-17 1:48 ` Jakub Kicinski
2025-01-17 2:15 ` David Wei
2025-01-16 23:16 ` [PATCH net-next v11 05/21] netdev: add io_uring memory provider info David Wei
2025-01-16 23:16 ` [PATCH net-next v11 06/21] net: page_pool: add callback for mp info printing David Wei
2025-01-16 23:16 ` [PATCH net-next v11 07/21] net: page_pool: add a mp hook to unregister_netdevice* David Wei
2025-01-16 23:16 ` [PATCH net-next v11 08/21] net: prepare for non devmem TCP memory providers David Wei
2025-01-16 23:16 ` [PATCH net-next v11 09/21] net: page_pool: add memory provider helpers David Wei
2025-01-17 1:47 ` Jakub Kicinski
2025-01-16 23:16 ` [PATCH net-next v11 10/21] net: add helpers for setting a memory provider on an rx queue David Wei
2025-01-17 1:50 ` Jakub Kicinski
2025-01-17 1:52 ` Jakub Kicinski
2025-01-17 2:17 ` David Wei
2025-01-17 2:25 ` Jakub Kicinski
2025-01-17 2:47 ` Pavel Begunkov
2025-01-17 22:11 ` Jakub Kicinski
2025-01-17 23:20 ` Pavel Begunkov
2025-01-18 2:08 ` Jakub Kicinski
2025-01-18 3:09 ` Pavel Begunkov
2025-01-16 23:16 ` [PATCH net-next v11 11/21] io_uring/zcrx: add interface queue and refill queue David Wei
2025-01-16 23:16 ` [PATCH net-next v11 12/21] io_uring/zcrx: add io_zcrx_area David Wei
2025-01-16 23:16 ` [PATCH net-next v11 13/21] io_uring/zcrx: grab a net device David Wei
2025-01-16 23:16 ` David Wei [this message]
2025-01-17 2:07 ` [PATCH net-next v11 14/21] io_uring/zcrx: implement zerocopy receive pp memory provider Jakub Kicinski
2025-01-17 2:17 ` David Wei
2025-01-16 23:16 ` [PATCH net-next v11 15/21] io_uring/zcrx: dma-map area for the device David Wei
2025-01-16 23:16 ` [PATCH net-next v11 16/21] io_uring/zcrx: add io_recvzc request David Wei
2025-01-16 23:16 ` [PATCH net-next v11 17/21] io_uring/zcrx: set pp memory provider for an rx queue David Wei
2025-01-17 2:13 ` Jakub Kicinski
2025-01-17 2:38 ` Pavel Begunkov
2025-01-16 23:17 ` [PATCH net-next v11 18/21] io_uring/zcrx: throttle receive requests David Wei
2025-01-16 23:17 ` [PATCH net-next v11 19/21] io_uring/zcrx: add copy fallback David Wei
2025-01-16 23:17 ` [PATCH net-next v11 20/21] net: add documentation for io_uring zcrx David Wei
2025-01-16 23:17 ` [PATCH net-next v11 21/21] io_uring/zcrx: add selftest David Wei
2025-01-17 14:28 ` [PATCH net-next v11 00/21] io_uring zero copy rx Paolo Abeni
2025-01-17 14:42 ` Pavel Begunkov
2025-01-17 16:05 ` Paolo Abeni
2025-01-17 16:19 ` Pavel Begunkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox