From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>
Subject: [RFC PATCH v4 11/16] io_uring: implement pp memory provider for zc rx
Date: Tue, 12 Mar 2024 14:44:25 -0700 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: Pavel Begunkov <[email protected]>
Implement a new pp memory provider for io_uring zerocopy receive.
All buffers are backed by struct io_zc_rx_buf, which is a thin extension
of struct net_iov. Initially, all of them are unallocated and placed in
a spinlock protected ->freelist. Then, they will be allocated via
the ->alloc_pages callback, which sets refcount to 1.
Later, buffers would either be dropped by the net stack and recycled
back into page pool / released by ->release_page, or, more likely, get
transferred to the userspace by posting a corresponding CQE and
elevating refcount by IO_ZC_RX_UREF. When the user is done with a buffer,
it should be put into the refill ring.
Next time io_pp_zc_alloc_pages() runs it'll check the ring, put user
refs and ultimately grab buffers from there. That's done in the attached
napi context and so doesn't need any additional synchronisation. That is
the second hottest path after getting a buffer from the pp lockless cache.
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
include/linux/io_uring/net.h | 5 +
include/net/page_pool/types.h | 1 +
io_uring/zc_rx.c | 202 ++++++++++++++++++++++++++++++++++
io_uring/zc_rx.h | 5 +
net/core/page_pool.c | 2 +-
5 files changed, 214 insertions(+), 1 deletion(-)
diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
index 05d5a6a97264..a225d7090b6b 100644
--- a/include/linux/io_uring/net.h
+++ b/include/linux/io_uring/net.h
@@ -12,6 +12,11 @@ struct io_zc_rx_buf {
};
#if defined(CONFIG_IO_URING)
+
+#if defined(CONFIG_PAGE_POOL)
+extern const struct memory_provider_ops io_uring_pp_zc_ops;
+#endif
+
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
#else
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 347837b83d36..9e91f2cdbe61 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -227,6 +227,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
struct page_pool *page_pool_create(const struct page_pool_params *params);
struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
int cpuid);
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
struct xdp_mem_info;
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 326ae3fcc643..b2507df121fb 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -8,6 +8,7 @@
#include <linux/nospec.h>
#include <net/tcp.h>
#include <net/af_unix.h>
+#include <trace/events/page_pool.h>
#include <uapi/linux/io_uring.h>
@@ -357,4 +358,205 @@ int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
return 0;
}
+/* Convert an embedded net_iov back to its containing io_zc_rx_buf. */
+static inline struct io_zc_rx_buf *io_niov_to_buf(struct net_iov *niov)
+{
+	return container_of(niov, struct io_zc_rx_buf, niov);
+}
+
+/* Index of @buf within the pool's contiguous ->bufs array (its "page id"). */
+static inline unsigned io_buf_pgid(struct io_zc_rx_pool *pool,
+				   struct io_zc_rx_buf *buf)
+{
+	return buf - pool->bufs;
+}
+
+/*
+ * Take a userspace reference on @buf by bumping pp_ref_count by
+ * IO_ZC_RX_UREF (user refs live in the high bits, kernel refs in the
+ * low IO_ZC_RX_KREF_MASK bits).  __maybe_unused: presumably wired up by
+ * a later patch in this series — no caller exists in this hunk.
+ */
+static __maybe_unused void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
+{
+	atomic_long_add(IO_ZC_RX_UREF, &buf->niov.pp_ref_count);
+}
+
+/* Drop @nr references from @buf; returns true when the count hits zero. */
+static bool io_zc_rx_buf_put(struct io_zc_rx_buf *buf, int nr)
+{
+	return atomic_long_sub_and_test(nr, &buf->niov.pp_ref_count);
+}
+
+/*
+ * Drop one userspace reference (IO_ZC_RX_UREF).  Returns false if the
+ * buffer holds no user refs — i.e. userspace returned a buffer it never
+ * owned — otherwise returns whether this put dropped the last reference.
+ * The read-then-sub is not atomic as a whole; safe here only because it
+ * runs from the single napi/refill context (see commit message).
+ */
+static bool io_zc_rx_put_buf_uref(struct io_zc_rx_buf *buf)
+{
+	if (atomic_long_read(&buf->niov.pp_ref_count) < IO_ZC_RX_UREF)
+		return false;
+
+	return io_zc_rx_buf_put(buf, IO_ZC_RX_UREF);
+}
+
+/* Wrap the buffer's net_iov as a netmem_ref for the page pool API. */
+static inline netmem_ref io_zc_buf_to_netmem(struct io_zc_rx_buf *buf)
+{
+	return net_iov_to_netmem(&buf->niov);
+}
+
+/*
+ * Stamp pp ownership info on the buffer and push it onto the page
+ * pool's lockless alloc cache.  Caller must ensure there is room in
+ * pp->alloc.cache (both callers cap at PP_ALLOC_CACHE_REFILL).
+ */
+static inline void io_zc_add_pp_cache(struct page_pool *pp,
+				      struct io_zc_rx_buf *buf)
+{
+	netmem_ref netmem = io_zc_buf_to_netmem(buf);
+
+	page_pool_set_pp_info(pp, netmem);
+	pp->alloc.cache[pp->alloc.count++] = netmem;
+}
+
+/*
+ * Number of refill-ring entries userspace has published and we have not
+ * yet consumed.  The acquire pairs with userspace's release store of
+ * ->tail; clamped to rq_entries to bound the damage of a hostile tail.
+ */
+static inline u32 io_zc_rx_rqring_entries(struct io_zc_rx_ifq *ifq)
+{
+	u32 entries;
+
+	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
+	return min(entries, ifq->rq_entries);
+}
+
+/*
+ * Fast-path refill: pull buffers userspace has returned via the refill
+ * ring, drop their user refs, and move freed ones into the page pool's
+ * alloc cache.  Runs from the attached napi context, so no locking is
+ * needed beyond the ring's acquire/release head/tail protocol.
+ */
+static void io_zc_rx_ring_refill(struct page_pool *pp,
+				 struct io_zc_rx_ifq *ifq)
+{
+	unsigned int entries = io_zc_rx_rqring_entries(ifq);
+	unsigned int mask = ifq->rq_entries - 1;
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	if (unlikely(!entries))
+		return;
+
+	while (entries--) {
+		unsigned int rq_idx = ifq->cached_rq_head++ & mask;
+		struct io_uring_rbuf_rqe *rqe = &ifq->rqes[rq_idx];
+		/* rqe lives in a ring shared with userspace: read it once */
+		u32 pgid = READ_ONCE(rqe->off) / PAGE_SIZE;
+		struct io_zc_rx_buf *buf;
+
+		/*
+		 * The offset is untrusted user input; without this check a
+		 * hostile rqe->off indexes out of bounds of pool->bufs[].
+		 */
+		if (unlikely(pgid >= pool->nr_bufs))
+			continue;
+		buf = &pool->bufs[pgid];
+
+		/* skip entries for buffers userspace never actually owned */
+		if (!io_zc_rx_put_buf_uref(buf))
+			continue;
+		io_zc_add_pp_cache(pp, buf);
+		if (pp->alloc.count >= PP_ALLOC_CACHE_REFILL)
+			break;
+	}
+	/* publish consumed head so userspace can reuse those slots */
+	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
+}
+
+/*
+ * Slow-path refill: when the refill ring is empty, take never-allocated
+ * (or recycled) buffers off the spinlock-protected freelist and push
+ * them into the pp alloc cache, accounting each as "held" by the pool.
+ */
+static void io_zc_rx_refill_slow(struct page_pool *pp, struct io_zc_rx_ifq *ifq)
+{
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	spin_lock_bh(&pool->freelist_lock);
+	while (pool->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
+		struct io_zc_rx_buf *buf;
+		u32 pgid;
+
+		/* freelist stores page ids; pop from the tail */
+		pgid = pool->freelist[--pool->free_count];
+		buf = &pool->bufs[pgid];
+
+		io_zc_add_pp_cache(pp, buf);
+		pp->pages_state_hold_cnt++;
+		trace_page_pool_state_hold(pp, io_zc_buf_to_netmem(buf),
+					   pp->pages_state_hold_cnt);
+	}
+	spin_unlock_bh(&pool->freelist_lock);
+}
+
+/* Return a fully-released buffer to the pool's freelist. */
+static void io_zc_rx_recycle_buf(struct io_zc_rx_pool *pool,
+				 struct io_zc_rx_buf *buf)
+{
+	spin_lock_bh(&pool->freelist_lock);
+	pool->freelist[pool->free_count++] = io_buf_pgid(pool, buf);
+	spin_unlock_bh(&pool->freelist_lock);
+}
+
+/*
+ * ->alloc_pages memory-provider hook.  Tries, in order: the pp alloc
+ * cache (expected empty on entry — pp checks it first), the userspace
+ * refill ring, then the freelist.  Returns 0 when no buffer is
+ * available.  @gfp is unused: buffers are preallocated, never kmalloc'd
+ * here.
+ */
+static netmem_ref io_pp_zc_alloc_pages(struct page_pool *pp, gfp_t gfp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+
+	/* pp should already be ensuring that */
+	if (unlikely(pp->alloc.count))
+		goto out_return;
+
+	io_zc_rx_ring_refill(pp, ifq);
+	if (likely(pp->alloc.count))
+		goto out_return;
+
+	io_zc_rx_refill_slow(pp, ifq);
+	if (!pp->alloc.count)
+		return 0;
+out_return:
+	return pp->alloc.cache[--pp->alloc.count];
+}
+
+/*
+ * ->release_page hook: the net stack is done with this netmem.  Drops
+ * one kernel ref and recycles the buffer to the freelist if that was
+ * the last ref.  Always returns false — the netmem is provider-backed,
+ * so the page pool must not attempt its own page release on it.
+ */
+static bool io_pp_zc_release_page(struct page_pool *pp, netmem_ref netmem)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_buf *buf;
+	struct net_iov *niov;
+
+	/* this provider only ever feeds net_iov-backed netmems to the pp */
+	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+		return false;
+
+	niov = netmem_to_net_iov(netmem);
+	buf = io_niov_to_buf(niov);
+
+	if (io_zc_rx_buf_put(buf, 1))
+		io_zc_rx_recycle_buf(ifq->pool, buf);
+	return false;
+}
+
+/*
+ * ->scrub hook: reclaim buffers still referenced by userspace, e.g. on
+ * pool teardown.  For every buffer holding user refs, strip them,
+ * recycle the buffer, and account a release so pp's hold/release
+ * counters balance.  Buffers without user refs are left alone — they
+ * are either free already or still owned by the net stack.
+ */
+static void io_pp_zc_scrub(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_pool *pool = ifq->pool;
+	int i;
+
+	for (i = 0; i < pool->nr_bufs; i++) {
+		struct io_zc_rx_buf *buf = &pool->bufs[i];
+		int count;
+
+		if (!io_zc_rx_put_buf_uref(buf))
+			continue;
+		io_zc_rx_recycle_buf(pool, buf);
+
+		count = atomic_inc_return_relaxed(&pp->pages_state_release_cnt);
+		trace_page_pool_state_release(pp, io_zc_buf_to_netmem(buf), count);
+	}
+}
+
+/*
+ * ->init hook: validate the page pool configuration against what this
+ * provider supports (order-0, napi-driven, no pp-side DMA mapping or
+ * syncing — DMA handling arrives in a later patch per the cover letter)
+ * and pin the owning io_uring ctx for the pool's lifetime.
+ */
+static int io_pp_zc_init(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+
+	if (!ifq)
+		return -EINVAL;
+	if (pp->p.order != 0)
+		return -EINVAL;
+	if (!pp->p.napi)
+		return -EINVAL;
+	if (pp->p.flags & PP_FLAG_DMA_MAP)
+		return -EOPNOTSUPP;
+	if (pp->p.flags & PP_FLAG_DMA_SYNC_DEV)
+		return -EOPNOTSUPP;
+
+	/* balanced by percpu_ref_put() in io_pp_zc_destroy() */
+	percpu_ref_get(&ifq->ctx->refs);
+	ifq->pp = pp;
+	return 0;
+}
+
+/*
+ * ->destroy hook: detach from the ifq and drop the ctx reference taken
+ * in io_pp_zc_init().  By this point every buffer should be back on the
+ * freelist; if not, leak the ctx ref (and WARN) rather than risk
+ * freeing memory that is still in flight.
+ */
+static void io_pp_zc_destroy(struct page_pool *pp)
+{
+	struct io_zc_rx_ifq *ifq = pp->mp_priv;
+	struct io_zc_rx_pool *pool = ifq->pool;
+
+	ifq->pp = NULL;
+
+	if (WARN_ON_ONCE(pool->free_count != pool->nr_bufs))
+		return;
+	percpu_ref_put(&ifq->ctx->refs);
+}
+
+/* Page pool memory provider backed by io_uring zerocopy-rx buffers. */
+const struct memory_provider_ops io_uring_pp_zc_ops = {
+	.alloc_pages		= io_pp_zc_alloc_pages,
+	.release_page		= io_pp_zc_release_page,
+	.init			= io_pp_zc_init,
+	.destroy		= io_pp_zc_destroy,
+	.scrub			= io_pp_zc_scrub,
+};
+EXPORT_SYMBOL(io_uring_pp_zc_ops);
+
+
#endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 466b2b8f9813..c02bf8cabc6c 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -10,6 +10,9 @@
#define IO_ZC_IFQ_IDX_OFFSET 16
#define IO_ZC_IFQ_IDX_MASK ((1U << IO_ZC_IFQ_IDX_OFFSET) - 1)
+#define IO_ZC_RX_UREF 0x10000
+#define IO_ZC_RX_KREF_MASK (IO_ZC_RX_UREF - 1)
+
struct io_zc_rx_pool {
struct io_zc_rx_ifq *ifq;
struct io_zc_rx_buf *bufs;
@@ -26,10 +29,12 @@ struct io_zc_rx_ifq {
struct io_ring_ctx *ctx;
struct net_device *dev;
struct io_zc_rx_pool *pool;
+ struct page_pool *pp;
struct io_uring *rq_ring;
struct io_uring_rbuf_rqe *rqes;
u32 rq_entries;
+ u32 cached_rq_head;
/* hw rx descriptor ring id */
u32 if_rxq_id;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index fc92e551ed13..f83ddbb4ebd8 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -460,7 +460,7 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
return false;
}
-static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
netmem_set_pp(netmem, pool);
netmem_or_pp_magic(netmem, PP_SIGNATURE);
--
2.43.0
next prev parent reply other threads:[~2024-03-12 21:44 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-12 21:44 [RFC PATCH v4 00/16] Zero copy Rx using io_uring David Wei
2024-03-12 21:44 ` [RFC PATCH v4 01/16] net: generalise pp provider params passing David Wei
2024-03-12 21:44 ` [RFC PATCH v4 02/16] io_uring: delayed cqe commit David Wei
2024-03-12 21:44 ` [RFC PATCH v4 03/16] net: page_pool: add ->scrub mem provider callback David Wei
2024-03-12 21:44 ` [RFC PATCH v4 04/16] io_uring: separate header for exported net bits David Wei
2024-03-12 21:44 ` [RFC PATCH v4 05/16] io_uring: introduce interface queue David Wei
2024-03-12 21:44 ` [RFC PATCH v4 06/16] io_uring: add mmap support for shared ifq ringbuffers David Wei
2024-03-12 21:44 ` [RFC PATCH v4 07/16] netdev: add XDP_SETUP_ZC_RX command David Wei
2024-03-12 21:44 ` [RFC PATCH v4 08/16] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2024-03-12 21:44 ` [RFC PATCH v4 09/16] io_uring/zcrx: implement socket registration David Wei
2024-03-12 21:44 ` [RFC PATCH v4 10/16] io_uring: add zero copy buf representation and pool David Wei
2024-03-12 21:44 ` David Wei [this message]
2024-03-12 21:44 ` [RFC PATCH v4 12/16] io_uring/zcrx: implement PP_FLAG_DMA_* handling David Wei
2024-03-12 21:44 ` [RFC PATCH v4 13/16] io_uring: add io_recvzc request David Wei
2024-03-13 20:25 ` Jens Axboe
2024-03-13 20:26 ` Pavel Begunkov
2024-03-13 21:03 ` Jens Axboe
2024-03-14 16:14 ` Jens Axboe
2024-03-15 17:34 ` Pavel Begunkov
2024-03-15 18:38 ` Jens Axboe
2024-03-15 23:52 ` Pavel Begunkov
2024-03-16 16:59 ` Jens Axboe
2024-03-17 21:22 ` Pavel Begunkov
2024-03-17 21:30 ` Jens Axboe
2024-03-12 21:44 ` [RFC PATCH v4 14/16] net: execute custom callback from napi David Wei
2024-03-12 21:44 ` [RFC PATCH v4 15/16] io_uring/zcrx: add copy fallback David Wei
2024-03-12 21:44 ` [RFC PATCH v4 16/16] veth: add support for io_uring zc rx David Wei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox