public inbox for [email protected]
 help / color / mirror / Atom feed
From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
	Pavel Begunkov <[email protected]>,
	Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
	"David S. Miller" <[email protected]>,
	Eric Dumazet <[email protected]>,
	Jesper Dangaard Brouer <[email protected]>,
	David Ahern <[email protected]>,
	Mina Almasry <[email protected]>
Subject: [RFC PATCH v4 05/16] io_uring: introduce interface queue
Date: Tue, 12 Mar 2024 14:44:19 -0700	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

From: David Wei <[email protected]>

This patch introduces a new object in io_uring called an interface queue
(ifq) which contains:

* A pool region allocated by userspace and registered w/ io_uring where
  Rx data is transferred to via DMA.
* A net device and one specific Rx queue in it that will be configured
  for ZC Rx.
* A new shared ringbuf w/ userspace called a refill ring. When userspace
  is done with bufs with Rx packet payloads, it writes entries into this
  ring to tell the kernel that bufs can be re-used by the NIC again.
  Each entry in the refill ring is a struct io_uring_rbuf_rqe.

On the completion side, the main CQ ring is used to notify userspace of
recv()'d packets. Big CQEs (32 bytes) are required to support this, as
the upper 16 bytes are used by ZC Rx to store a feature specific struct
io_uring_rbuf_cqe.

Add two new struct types:

1. io_uring_rbuf_rqe - entry in refill ring
2. io_uring_rbuf_cqe - entry in upper 16 bytes of a big CQE

For now, each io_uring instance has a single ifq, and each ifq has a
single pool region associated with one Rx queue.

Add a new opcode and functions to setup and tear down an ifq. Size and
offsets of the shared refill ring are returned to userspace for it to
mmap in the registration struct io_uring_zc_rx_ifq_reg, similar to the
main SQ/CQ rings.

Signed-off-by: David Wei <[email protected]>
---
 include/linux/io_uring_types.h |   4 ++
 include/uapi/linux/io_uring.h  |  40 ++++++++++++
 io_uring/Makefile              |   3 +-
 io_uring/io_uring.c            |   7 +++
 io_uring/register.c            |   7 +++
 io_uring/zc_rx.c               | 109 +++++++++++++++++++++++++++++++++
 io_uring/zc_rx.h               |  35 +++++++++++
 7 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 io_uring/zc_rx.c
 create mode 100644 io_uring/zc_rx.h

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 500772189fee..27e750a02ea5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -39,6 +39,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_COMPAT		= (1 << 12),
 };
 
+struct io_zc_rx_ifq;
+
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
@@ -385,6 +387,8 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
 
+	struct io_zc_rx_ifq		*ifq;
+
 	/* protected by ->uring_lock */
 	struct list_head		rsrc_ref_list;
 	struct io_alloc_cache		rsrc_node_cache;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7bd10201a02b..7b643fe420c5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -575,6 +575,9 @@ enum {
 	IORING_REGISTER_NAPI			= 27,
 	IORING_UNREGISTER_NAPI			= 28,
 
+	/* register a network interface queue for zerocopy */
+	IORING_REGISTER_ZC_RX_IFQ		= 29,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -782,6 +785,43 @@ enum {
 	SOCKET_URING_OP_SETSOCKOPT,
 };
 
+struct io_uring_rbuf_rqe {
+	__u32	off;
+	__u32	len;
+	__u16	region;
+	__u8	__pad[6];
+};
+
+struct io_uring_rbuf_cqe {
+	__u32	off;
+	__u32	len;
+	__u16	region;
+	__u8	__pad[6];
+};
+
+struct io_rbuf_rqring_offsets {
+	__u32	head;
+	__u32	tail;
+	__u32	rqes;
+	__u8	__pad[4];
+};
+
+/*
+ * Argument for IORING_REGISTER_ZC_RX_IFQ
+ */
+struct io_uring_zc_rx_ifq_reg {
+	__u32	if_idx;
+	/* hw rx descriptor ring id */
+	__u32	if_rxq_id;
+	__u32	region_id;
+	__u32	rq_entries;
+	__u32	flags;
+	__u16	cpu;
+
+	__u32	mmap_sz;
+	struct io_rbuf_rqring_offsets rq_off;
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 2e1d4e03799c..bb47231c611b 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					statx.o net.o msg_ring.o timeout.o \
 					sqpoll.o fdinfo.o tctx.o poll.o \
 					cancel.o kbuf.o rsrc.o rw.o opdef.o \
-					notif.o waitid.o register.o truncate.o
+					notif.o waitid.o register.o truncate.o \
+					zc_rx.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FUTEX)		+= futex.o
 obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e44c2ef271b9..5614c47cecd9 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -95,6 +95,7 @@
 #include "waitid.h"
 #include "futex.h"
 #include "napi.h"
+#include "zc_rx.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -2861,6 +2862,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		return;
 
 	mutex_lock(&ctx->uring_lock);
+	io_unregister_zc_rx_ifqs(ctx);
 	if (ctx->buf_data)
 		__io_sqe_buffers_unregister(ctx);
 	if (ctx->file_data)
@@ -3032,6 +3034,11 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 			io_cqring_overflow_kill(ctx);
 			mutex_unlock(&ctx->uring_lock);
 		}
+		if (ctx->ifq) {
+			mutex_lock(&ctx->uring_lock);
+			io_shutdown_zc_rx_ifqs(ctx);
+			mutex_unlock(&ctx->uring_lock);
+		}
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
 			io_move_task_work_from_local(ctx);
diff --git a/io_uring/register.c b/io_uring/register.c
index 99c37775f974..760f0b6a051c 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -27,6 +27,7 @@
 #include "cancel.h"
 #include "kbuf.h"
 #include "napi.h"
+#include "zc_rx.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -563,6 +564,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_unregister_napi(ctx, arg);
 		break;
+	case IORING_REGISTER_ZC_RX_IFQ:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_zc_rx_ifq(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
new file mode 100644
index 000000000000..e6c33f94c086
--- /dev/null
+++ b/io_uring/zc_rx.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+#if defined(CONFIG_PAGE_POOL)
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "kbuf.h"
+#include "zc_rx.h"
+
+static int io_allocate_rbuf_ring(struct io_zc_rx_ifq *ifq,
+				 struct io_uring_zc_rx_ifq_reg *reg)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	size_t off, rq_size;
+	void *ptr;
+
+	off = sizeof(struct io_uring);
+	rq_size = reg->rq_entries * sizeof(struct io_uring_rbuf_rqe);
+	ptr = (void *) __get_free_pages(gfp, get_order(off + rq_size));
+	if (!ptr)
+		return -ENOMEM;
+	ifq->rq_ring = (struct io_uring *)ptr;
+	ifq->rqes = (struct io_uring_rbuf_rqe *)((char *)ptr + off);
+	return 0;
+}
+
+static void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq)
+{
+	if (ifq->rq_ring)
+		folio_put(virt_to_folio(ifq->rq_ring));
+}
+
+static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx)
+{
+	struct io_zc_rx_ifq *ifq;
+
+	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
+	if (!ifq)
+		return NULL;
+
+	ifq->if_rxq_id = -1;
+	ifq->ctx = ctx;
+	return ifq;
+}
+
+static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq)
+{
+	io_free_rbuf_ring(ifq);
+	kfree(ifq);
+}
+
+int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg)
+{
+	struct io_uring_zc_rx_ifq_reg reg;
+	struct io_zc_rx_ifq *ifq;
+	int ret;
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
+	      ctx->flags & IORING_SETUP_CQE32))
+		return -EINVAL;
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (ctx->ifq)
+		return -EBUSY;
+	if (reg.if_rxq_id == -1)
+		return -EINVAL;
+
+	ifq = io_zc_rx_ifq_alloc(ctx);
+	if (!ifq)
+		return -ENOMEM;
+
+	ret = io_allocate_rbuf_ring(ifq, &reg);
+	if (ret)
+		goto err;
+
+	ifq->rq_entries = reg.rq_entries;
+	ifq->if_rxq_id = reg.if_rxq_id;
+	ctx->ifq = ifq;
+
+	return 0;
+err:
+	io_zc_rx_ifq_free(ifq);
+	return ret;
+}
+
+void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+	struct io_zc_rx_ifq *ifq = ctx->ifq;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (!ifq)
+		return;
+
+	ctx->ifq = NULL;
+	io_zc_rx_ifq_free(ifq);
+}
+
+void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+	lockdep_assert_held(&ctx->uring_lock);
+}
+
+#endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
new file mode 100644
index 000000000000..35b019b275e0
--- /dev/null
+++ b/io_uring/zc_rx.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_ZC_RX_H
+#define IOU_ZC_RX_H
+
+struct io_zc_rx_ifq {
+	struct io_ring_ctx		*ctx;
+	struct net_device		*dev;
+	struct io_uring			*rq_ring;
+	struct io_uring_rbuf_rqe 	*rqes;
+	u32				rq_entries;
+
+	/* hw rx descriptor ring id */
+	u32				if_rxq_id;
+};
+
+#if defined(CONFIG_PAGE_POOL)
+int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg);
+void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx);
+void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx);
+#else
+static inline int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+static inline void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+#endif
+
+#endif
-- 
2.43.0


  parent reply	other threads:[~2024-03-12 21:44 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-12 21:44 [RFC PATCH v4 00/16] Zero copy Rx using io_uring David Wei
2024-03-12 21:44 ` [RFC PATCH v4 01/16] net: generalise pp provider params passing David Wei
2024-03-12 21:44 ` [RFC PATCH v4 02/16] io_uring: delayed cqe commit David Wei
2024-03-12 21:44 ` [RFC PATCH v4 03/16] net: page_pool: add ->scrub mem provider callback David Wei
2024-03-12 21:44 ` [RFC PATCH v4 04/16] io_uring: separate header for exported net bits David Wei
2024-03-12 21:44 ` David Wei [this message]
2024-03-12 21:44 ` [RFC PATCH v4 06/16] io_uring: add mmap support for shared ifq ringbuffers David Wei
2024-03-12 21:44 ` [RFC PATCH v4 07/16] netdev: add XDP_SETUP_ZC_RX command David Wei
2024-03-12 21:44 ` [RFC PATCH v4 08/16] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2024-03-12 21:44 ` [RFC PATCH v4 09/16] io_uring/zcrx: implement socket registration David Wei
2024-03-12 21:44 ` [RFC PATCH v4 10/16] io_uring: add zero copy buf representation and pool David Wei
2024-03-12 21:44 ` [RFC PATCH v4 11/16] io_uring: implement pp memory provider for zc rx David Wei
2024-03-12 21:44 ` [RFC PATCH v4 12/16] io_uring/zcrx: implement PP_FLAG_DMA_* handling David Wei
2024-03-12 21:44 ` [RFC PATCH v4 13/16] io_uring: add io_recvzc request David Wei
2024-03-13 20:25   ` Jens Axboe
2024-03-13 20:26     ` Pavel Begunkov
2024-03-13 21:03       ` Jens Axboe
2024-03-14 16:14       ` Jens Axboe
2024-03-15 17:34         ` Pavel Begunkov
2024-03-15 18:38           ` Jens Axboe
2024-03-15 23:52             ` Pavel Begunkov
2024-03-16 16:59               ` Jens Axboe
2024-03-17 21:22                 ` Pavel Begunkov
2024-03-17 21:30                   ` Jens Axboe
2024-03-12 21:44 ` [RFC PATCH v4 14/16] net: execute custom callback from napi David Wei
2024-03-12 21:44 ` [RFC PATCH v4 15/16] io_uring/zcrx: add copy fallback David Wei
2024-03-12 21:44 ` [RFC PATCH v4 16/16] veth: add support for io_uring zc rx David Wei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox