public inbox for [email protected]
 help / color / mirror / Atom feed
From: David Wei <[email protected]>
To: Jens Axboe <[email protected]>, Pavel Begunkov <[email protected]>
Cc: [email protected], Mina Almasry <[email protected]>,
	Jakub Kicinski <[email protected]>
Subject: [PATCH 06/11] io_uring: add ZC pool API
Date: Fri, 25 Aug 2023 15:55:45 -0700	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

From: David Wei <[email protected]>

Add an API to get bufs from, and put bufs back into, the ZC pool that
was introduced in the previous patch.

Recall that each ifq has an rbuf refill ring that is shared with
userspace. Userspace returns the bufs it is done with by posting them to
this ring. A new tier is added to the ZC pool that drains entries from
the refill ring into the cache. So when the cache is empty, it is
refilled from the refill ring first, then from the freelist.

ZC bufs are refcounted, with both a kref and a uref. Userspace is given
an off + len into the entire ZC pool region, not individual pages from
ZC bufs. A net device may pack multiple packets into the same page it
gets from a ZC buf, so it is possible for the same ZC buf to be handed
out to userspace multiple times.

This means it is possible to drain the entire refill ring and still have
no usable free bufs, because a buf only becomes reusable once all of its
references have been dropped. Suggestions for dealing with this are very
welcome!

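Only up to POOL_REFILL_COUNT bufs are moved from the refill ring into
the cache per refill. Entries that do not free a buf do not count
towards this limit, so a single refill may still walk every pending ring
entry. Given the above, we may want to limit the amount of work being
done, since refilling happens inside the NAPI softirq context.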

Signed-off-by: David Wei <[email protected]>
Co-developed-by: Jonathan Lemon <[email protected]>
---
 include/linux/io_uring.h | 18 ++++++++
 io_uring/zc_rx.c         | 98 ++++++++++++++++++++++++++++++++++++++++
 io_uring/zc_rx.h         | 13 ++++++
 3 files changed, 129 insertions(+)

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index cf1993befa6a..61eae25a8f1d 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -60,6 +60,17 @@ void __io_uring_free(struct task_struct *tsk);
 void io_uring_unreg_ringfd(void);
 const char *io_uring_get_opcode(u8 opcode);
 
+struct io_zc_rx_ifq;
+struct io_zc_rx_buf *io_zc_rx_get_buf(struct io_zc_rx_ifq *ifq);
+void io_zc_rx_put_buf(struct io_zc_rx_ifq *ifq, struct io_zc_rx_buf *buf);
+static inline dma_addr_t io_zc_rx_buf_dma(struct io_zc_rx_buf *buf)
+{
+	return buf->dma;	/* NOTE(review): needs struct io_zc_rx_buf fully defined here; confirm */
+}
+static inline struct page *io_zc_rx_buf_page(struct io_zc_rx_buf *buf)
+{
+	return buf->page;	/* accessor for the buf's page field */
+}
 static inline void io_uring_files_cancel(void)
 {
 	if (current->io_uring) {
@@ -108,6 +119,13 @@ static inline const char *io_uring_get_opcode(u8 opcode)
 {
 	return "";
 }
+static inline struct io_zc_rx_buf *io_zc_rx_get_buf(struct io_zc_rx_ifq *ifq)
+{
+	return NULL;	/* stub variant: never has a buf to hand out */
+}
+/* No-op stub; static inline so the header does not emit one definition per includer. */
+static inline void io_zc_rx_put_buf(struct io_zc_rx_ifq *ifq, struct io_zc_rx_buf *buf)
+{ }
 #endif
 
 #endif
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 317127d0d4e7..14bc063f1c6c 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -14,6 +14,9 @@
 #include "zc_rx.h"
 
 #define POOL_CACHE_SIZE	128
+#define POOL_REFILL_COUNT	64	/* max bufs moved from the refill ring per refill */
+#define IO_ZC_RX_UREF		0x10000	/* refcount unit subtracted when a user reference is dropped */
+#define IO_ZC_RX_KREF_MASK	(IO_ZC_RX_UREF - 1)	/* low 16 bits of refcount, below the uref unit */
 
 struct io_zc_rx_pool {
 	struct io_zc_rx_ifq  	*ifq;
@@ -267,6 +270,8 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 
 	ifq->rq_entries = reg.rq_entries;
 	ifq->cq_entries = reg.cq_entries;
+	ifq->cached_rq_head = 0;
+	ifq->cached_cq_tail = 0;
 	ifq->if_rxq_id = reg.if_rxq_id;
 	ctx->ifq = ifq;
 
@@ -309,3 +314,96 @@ int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx)
 
 	return 0;
 }
+
+static bool io_zc_rx_put_buf_uref(struct io_zc_rx_buf *buf)
+{
+	bool has_uref = atomic_read(&buf->refcount) >= IO_ZC_RX_UREF;
+
+	/* Drop one uref if held; true only if the refcount reaches zero. */
+	return has_uref && atomic_sub_and_test(IO_ZC_RX_UREF, &buf->refcount);
+}
+
+/* Move up to @count bufs released by userspace from the refill ring into the
+ * pool cache.  Fills cache[] from index 0, so the cache must be empty on entry. */
+static void io_zc_rx_refill_cache(struct io_zc_rx_ifq *ifq, int count)
+{
+	unsigned int entries = io_zc_rx_rqring_entries(ifq);
+	unsigned int mask = ifq->rq_entries - 1;
+	struct io_zc_rx_pool *pool = ifq->pool;
+	int i, filled;
+
+	if (!entries)
+		return;
+
+	for (i = 0, filled = 0; i < entries && filled < count; i++) {
+		unsigned int rq_idx = ifq->cached_rq_head++ & mask;
+		/* rqes[] is shared with userspace, so read the offset exactly once. */
+		u32 pgid = READ_ONCE(ifq->rqes[rq_idx].off) / PAGE_SIZE;
+
+		/* NOTE(review): pgid is not range-checked before indexing bufs[]. */
+		if (!io_zc_rx_put_buf_uref(&pool->bufs[pgid]))
+			continue;
+		pool->cache[filled++] = pgid;
+	}
+
+	/* Publish the consumed head position to the shared ring. */
+	smp_store_release(&ifq->ring->rq.head, ifq->cached_rq_head);
+	pool->cache_count += filled;
+}
+
+/* Hand out one ZC buf with its refcount set to 1, or NULL if none is available.
+ * Tries the pool cache, then the userspace refill ring, then the locked freelist. */
+struct io_zc_rx_buf *io_zc_rx_get_buf(struct io_zc_rx_ifq *ifq)
+{
+	struct io_zc_rx_pool *pool = ifq->pool;
+	struct io_zc_rx_buf *buf;
+	u32 count, pgid;	/* cache[]/freelist[] entries are copied as u32; u16 would truncate */
+
+	if (pool->cache_count)
+		goto out;
+
+	io_zc_rx_refill_cache(ifq, POOL_REFILL_COUNT);
+	if (pool->cache_count)
+		goto out;
+
+	spin_lock(&pool->freelist_lock);
+
+	count = min_t(u32, pool->free_count, POOL_CACHE_SIZE);
+	pool->free_count -= count;
+	pool->cache_count += count;
+	memcpy(pool->cache, &pool->freelist[pool->free_count],
+	       count * sizeof(u32));
+
+	spin_unlock(&pool->freelist_lock);
+
+	if (pool->cache_count)
+		goto out;
+
+	return NULL;
+out:
+	pgid = pool->cache[--pool->cache_count];
+	buf = &pool->bufs[pgid];
+	atomic_set(&buf->refcount, 1);
+
+	return buf;
+}
+EXPORT_SYMBOL(io_zc_rx_get_buf);
+
+static void io_zc_rx_recycle_buf(struct io_zc_rx_pool *pool,
+				 struct io_zc_rx_buf *buf)
+{
+	spin_lock(&pool->freelist_lock);	/* held across the freelist[] / free_count update */
+	pool->freelist[pool->free_count++] = buf - pool->bufs;	/* push buf's index within bufs[] */
+	spin_unlock(&pool->freelist_lock);
+}
+
+/* Drop one reference on @buf; the final put returns it to the pool freelist. */
+void io_zc_rx_put_buf(struct io_zc_rx_ifq *ifq, struct io_zc_rx_buf *buf)
+{
+	bool last_ref = atomic_dec_and_test(&buf->refcount);
+
+	/* The pool is only touched once the count has dropped to zero. */
+	if (last_ref)
+		io_zc_rx_recycle_buf(ifq->pool, buf);
+}
+EXPORT_SYMBOL(io_zc_rx_put_buf);
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 3cd0e730115d..b063a3c81ccb 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -2,6 +2,8 @@
 #ifndef IOU_ZC_RX_H
 #define IOU_ZC_RX_H
 
+#include <linux/io_uring_types.h>
+
 struct io_zc_rx_ifq {
 	struct io_ring_ctx	*ctx;
 	struct net_device	*dev;
@@ -9,12 +11,23 @@ struct io_zc_rx_ifq {
 	struct io_uring_rbuf_rqe *rqes;
 	struct io_uring_rbuf_cqe *cqes;
 	u32			rq_entries, cq_entries;
+	u32			cached_rq_head;
+	u32			cached_cq_tail;
 	void			*pool;
 
 	/* hw rx descriptor ring id */
 	u32			if_rxq_id;
 };
 
+/* Count of refill-ring entries not yet consumed, capped at the ring size. */
+static inline u32 io_zc_rx_rqring_entries(struct io_zc_rx_ifq *ifq)
+{
+	u32 tail = smp_load_acquire(&ifq->ring->rq.tail);
+	u32 pending = tail - ifq->cached_rq_head;
+
+	return min(pending, ifq->rq_entries);
+}
+
 int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zc_rx_ifq_reg __user *arg);
 int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx);
-- 
2.39.3


  parent reply	other threads:[~2023-08-25 22:57 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-08-25 22:55 [RFC PATCH 00/11] Zero copy network RX using io_uring David Wei
2023-08-25 22:55 ` [PATCH 01/11] io_uring: add interface queue David Wei
2023-08-25 22:55 ` [PATCH 02/11] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-08-25 22:55 ` [PATCH 03/11] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-08-25 22:55 ` [PATCH 04/11] io_uring: setup ZC for an RX queue when registering an ifq David Wei
2023-08-25 22:55 ` [PATCH 05/11] io_uring: add ZC buf and pool David Wei
2023-08-25 22:55 ` David Wei [this message]
2023-08-25 22:55 ` [PATCH 07/11] skbuff: add SKBFL_FIXED_FRAG and skb_fixed() David Wei
2023-08-25 22:55 ` [PATCH 08/11] io_uring: allocate a uarg for freeing zero copy skbs David Wei
2023-08-25 22:55 ` [PATCH 09/11] io_uring: delay ZC pool destruction David Wei
2023-08-25 22:55 ` [PATCH 10/11] netdev/bnxt: add data pool and use it in BNXT driver David Wei
2023-08-25 22:55 ` [PATCH 11/11] io_uring: add io_recvzc request David Wei
2023-08-26  0:04 ` [RFC PATCH 00/11] Zero copy network RX using io_uring Jakub Kicinski
  -- strict thread matches above, loose matches on Subject: below --
2023-08-26  1:19 [RFC RESEND " David Wei
2023-08-26  1:19 ` [PATCH 06/11] io_uring: add ZC pool API David Wei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox