From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>,
Stanislav Fomichev <[email protected]>,
Joe Damato <[email protected]>,
Pedro Tammela <[email protected]>
Subject: [PATCH net-next v9 10/20] io_uring/zcrx: add interface queue and refill queue
Date: Tue, 17 Dec 2024 16:37:36 -0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
Add a new object called an interface queue (ifq) that represents a net
rx queue that has been configured for zero copy. Each ifq is registered
using a new registration opcode IORING_REGISTER_ZCRX_IFQ.
The refill queue is allocated by the kernel and mapped by userspace
using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main
SQ/CQ. It is used by userspace to return buffers that it is done with,
which will then be re-used by the netdev again.
The main CQ ring is used to notify userspace of received data by using
the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each
entry contains the offset + len to the data.
For now, each io_uring instance only has a single ifq.
Signed-off-by: David Wei <[email protected]>
---
Kconfig | 2 +
include/linux/io_uring_types.h | 6 ++
include/uapi/linux/io_uring.h | 43 +++++++++-
io_uring/KConfig | 10 +++
io_uring/Makefile | 1 +
io_uring/io_uring.c | 7 ++
io_uring/memmap.h | 1 +
io_uring/register.c | 7 ++
io_uring/zcrx.c | 149 +++++++++++++++++++++++++++++++++
io_uring/zcrx.h | 35 ++++++++
10 files changed, 260 insertions(+), 1 deletion(-)
create mode 100644 io_uring/KConfig
create mode 100644 io_uring/zcrx.c
create mode 100644 io_uring/zcrx.h
diff --git a/Kconfig b/Kconfig
index 745bc773f567..529ea7694ba9 100644
--- a/Kconfig
+++ b/Kconfig
@@ -30,3 +30,5 @@ source "lib/Kconfig"
source "lib/Kconfig.debug"
source "Documentation/Kconfig"
+
+source "io_uring/KConfig"
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 493a8f7fa8e4..f0c6e18d176a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -40,6 +40,8 @@ enum io_uring_cmd_flags {
IO_URING_F_TASK_DEAD = (1 << 13),
};
+struct io_zcrx_ifq;
+
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@@ -384,6 +386,8 @@ struct io_ring_ctx {
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
+ struct io_zcrx_ifq *ifq;
+
u32 pers_next;
struct xarray personalities;
@@ -436,6 +440,8 @@ struct io_ring_ctx {
struct io_mapped_region ring_region;
/* used for optimised request parameter and wait argument passing */
struct io_mapped_region param_region;
+ /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
+ struct io_mapped_region zcrx_region;
};
struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 38f0d6b10eaf..3af8b7a19824 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -638,7 +638,8 @@ enum io_uring_register_op {
/* send MSG_RING without having a ring */
IORING_REGISTER_SEND_MSG_RING = 31,
- /* 32 reserved for zc rx */
+ /* register a netdev hw rx queue for zerocopy */
+ IORING_REGISTER_ZCRX_IFQ = 32,
/* resize CQ ring */
IORING_REGISTER_RESIZE_RINGS = 33,
@@ -955,6 +956,46 @@ enum io_uring_socket_op {
SOCKET_URING_OP_SETSOCKOPT,
};
+/* Zero copy receive refill queue entry */
+struct io_uring_zcrx_rqe {
+ __u64 off;
+ __u32 len;
+ __u32 __pad;
+};
+
+struct io_uring_zcrx_cqe {
+ __u64 off;
+ __u64 __pad;
+};
+
+/* The bit from which area id is encoded into offsets */
+#define IORING_ZCRX_AREA_SHIFT 48
+#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
+
+struct io_uring_zcrx_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 rqes;
+ __u32 __resv2;
+ __u64 __resv[2];
+};
+
+/*
+ * Argument for IORING_REGISTER_ZCRX_IFQ
+ */
+struct io_uring_zcrx_ifq_reg {
+ __u32 if_idx;
+ __u32 if_rxq;
+ __u32 rq_entries;
+ __u32 flags;
+
+ __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
+ __u64 region_ptr; /* struct io_uring_region_desc * */
+
+ struct io_uring_zcrx_offsets offsets;
+ __u64 __resv[4];
+};
+
#ifdef __cplusplus
}
#endif
diff --git a/io_uring/KConfig b/io_uring/KConfig
new file mode 100644
index 000000000000..9e2a4beba1ef
--- /dev/null
+++ b/io_uring/KConfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# io_uring configuration
+#
+
+config IO_URING_ZCRX
+ def_bool y
+ depends on PAGE_POOL
+ depends on INET
+ depends on NET_RX_BUSY_POLL
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 53167bef37d7..a95b0b8229c9 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
epoll.o statx.o timeout.o fdinfo.o \
cancel.o waitid.o register.o \
truncate.o memmap.o
+obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5535a72b0ce1..0c02a2e97f01 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -97,6 +97,7 @@
#include "uring_cmd.h"
#include "msg_ring.h"
#include "memmap.h"
+#include "zcrx.h"
#include "timeout.h"
#include "poll.h"
@@ -2686,6 +2687,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
mutex_lock(&ctx->uring_lock);
io_sqe_buffers_unregister(ctx);
io_sqe_files_unregister(ctx);
+ io_unregister_zcrx_ifqs(ctx);
io_cqring_overflow_kill(ctx);
io_eventfd_unregister(ctx);
io_alloc_cache_free(&ctx->apoll_cache, kfree);
@@ -2851,6 +2853,11 @@ static __cold void io_ring_exit_work(struct work_struct *work)
io_cqring_overflow_kill(ctx);
mutex_unlock(&ctx->uring_lock);
}
+ if (ctx->ifq) {
+ mutex_lock(&ctx->uring_lock);
+ io_shutdown_zcrx_ifqs(ctx);
+ mutex_unlock(&ctx->uring_lock);
+ }
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_move_task_work_from_local(ctx);
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index c898dcba2b4e..dad0aa5b1b45 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -2,6 +2,7 @@
#define IO_URING_MEMMAP_H
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
+#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
diff --git a/io_uring/register.c b/io_uring/register.c
index f1698c18c7cb..f9dfdca79a80 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -30,6 +30,7 @@
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
+#include "zcrx.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -798,6 +799,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_clone_buffers(ctx, arg);
break;
+ case IORING_REGISTER_ZCRX_IFQ:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_zcrx_ifq(ctx, arg);
+ break;
case IORING_REGISTER_RESIZE_RINGS:
ret = -EINVAL;
if (!arg || nr_args != 1)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
new file mode 100644
index 000000000000..f3ace7e8264d
--- /dev/null
+++ b/io_uring/zcrx.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "kbuf.h"
+#include "memmap.h"
+#include "zcrx.h"
+
+#define IO_RQ_MAX_ENTRIES 32768
+
+static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
+ struct io_uring_zcrx_ifq_reg *reg,
+ struct io_uring_region_desc *rd)
+{
+ size_t off, size;
+ void *ptr;
+ int ret;
+
+ off = sizeof(struct io_uring);
+ size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
+ if (size > rd->size)
+ return -EINVAL;
+
+ ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
+ IORING_MAP_OFF_ZCRX_REGION);
+ if (ret < 0)
+ return ret;
+
+ ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
+ ifq->rq_ring = (struct io_uring *)ptr;
+ ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
+ return 0;
+}
+
+static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
+{
+ io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
+ ifq->rq_ring = NULL;
+ ifq->rqes = NULL;
+}
+
+static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
+{
+ struct io_zcrx_ifq *ifq;
+
+ ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
+ if (!ifq)
+ return NULL;
+
+ ifq->if_rxq = -1;
+ ifq->ctx = ctx;
+ return ifq;
+}
+
+static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
+{
+ io_free_rbuf_ring(ifq);
+ kfree(ifq);
+}
+
+int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+ struct io_uring_zcrx_ifq_reg __user *arg)
+{
+ struct io_uring_zcrx_ifq_reg reg;
+ struct io_uring_region_desc rd;
+ struct io_zcrx_ifq *ifq;
+ int ret;
+
+ /*
+ * 1. Interface queue allocation.
+ * 2. It can observe data destined for sockets of other tasks.
+ */
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* mandatory io_uring features for zc rx */
+ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
+ ctx->flags & IORING_SETUP_CQE32))
+ return -EINVAL;
+ if (ctx->ifq)
+ return -EBUSY;
+ if (copy_from_user(®, arg, sizeof(reg)))
+ return -EFAULT;
+ if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+ return -EFAULT;
+ if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)))
+ return -EINVAL;
+ if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
+ return -EINVAL;
+ if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
+ if (!(ctx->flags & IORING_SETUP_CLAMP))
+ return -EINVAL;
+ reg.rq_entries = IO_RQ_MAX_ENTRIES;
+ }
+ reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
+
+ if (!reg.area_ptr)
+ return -EFAULT;
+
+ ifq = io_zcrx_ifq_alloc(ctx);
+ if (!ifq)
+ return -ENOMEM;
+
+ ret = io_allocate_rbuf_ring(ifq, ®, &rd);
+ if (ret)
+ goto err;
+
+ ifq->rq_entries = reg.rq_entries;
+ ifq->if_rxq = reg.if_rxq;
+
+ reg.offsets.rqes = sizeof(struct io_uring);
+ reg.offsets.head = offsetof(struct io_uring, head);
+ reg.offsets.tail = offsetof(struct io_uring, tail);
+
+ if (copy_to_user(arg, ®, sizeof(reg)) ||
+ copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ ctx->ifq = ifq;
+ return 0;
+err:
+ io_zcrx_ifq_free(ifq);
+ return ret;
+}
+
+void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+ struct io_zcrx_ifq *ifq = ctx->ifq;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ if (!ifq)
+ return;
+
+ ctx->ifq = NULL;
+ io_zcrx_ifq_free(ifq);
+}
+
+void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+ lockdep_assert_held(&ctx->uring_lock);
+}
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
new file mode 100644
index 000000000000..58e4ab6c6083
--- /dev/null
+++ b/io_uring/zcrx.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_ZC_RX_H
+#define IOU_ZC_RX_H
+
+#include <linux/io_uring_types.h>
+
+struct io_zcrx_ifq {
+ struct io_ring_ctx *ctx;
+ struct io_uring *rq_ring;
+ struct io_uring_zcrx_rqe *rqes;
+ u32 rq_entries;
+
+ u32 if_rxq;
+};
+
+#if defined(CONFIG_IO_URING_ZCRX)
+int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+ struct io_uring_zcrx_ifq_reg __user *arg);
+void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
+void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
+#else
+static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+ struct io_uring_zcrx_ifq_reg __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+#endif
+
+#endif
--
2.43.5
next prev parent reply other threads:[~2024-12-18 0:38 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-18 0:37 [PATCH RESEND net-next v9 00/21] io_uring zero copy rx David Wei
2024-12-18 0:37 ` [PATCH net-next v9 01/20] net: page_pool: don't cast mp param to devmem David Wei
2024-12-20 22:04 ` Jakub Kicinski
2024-12-18 0:37 ` [PATCH net-next v9 02/20] net: prefix devmem specific helpers David Wei
2024-12-20 22:05 ` Jakub Kicinski
2024-12-18 0:37 ` [PATCH net-next v9 03/20] net: generalise net_iov chunk owners David Wei
2024-12-20 22:14 ` Jakub Kicinski
2024-12-21 0:50 ` Pavel Begunkov
2024-12-21 2:17 ` Jakub Kicinski
2024-12-18 0:37 ` [PATCH net-next v9 04/20] net: page_pool: create hooks for custom page providers David Wei
2024-12-18 0:37 ` [PATCH net-next v9 05/20] net: page_pool: add mp op for netlink reporting David Wei
2024-12-20 22:16 ` Jakub Kicinski
2024-12-18 0:37 ` [PATCH net-next v9 06/20] net: page_pool: add a mp hook to unregister_netdevice* David Wei
2024-12-20 22:18 ` Jakub Kicinski
2024-12-18 0:37 ` [PATCH net-next v9 07/20] net: prepare for non devmem TCP memory providers David Wei
2024-12-20 22:18 ` Jakub Kicinski
2024-12-18 0:37 ` [PATCH net-next v9 08/20] net: expose page_pool_{set,clear}_pp_info David Wei
2024-12-20 22:31 ` Jakub Kicinski
2024-12-21 1:07 ` Pavel Begunkov
2024-12-21 2:23 ` Jakub Kicinski
2024-12-18 0:37 ` [PATCH net-next v9 09/20] net: page_pool: introduce page_pool_mp_return_in_cache David Wei
2024-12-18 0:37 ` David Wei [this message]
2024-12-18 0:37 ` [PATCH net-next v9 11/20] io_uring/zcrx: add io_zcrx_area David Wei
2024-12-18 0:37 ` [PATCH net-next v9 12/20] io_uring/zcrx: grab a net device David Wei
2024-12-18 0:37 ` [PATCH net-next v9 13/20] net: page pool: export page_pool_set_dma_addr_netmem() David Wei
2024-12-18 0:37 ` [PATCH net-next v9 14/20] io_uring/zcrx: dma-map area for the device David Wei
2024-12-20 22:38 ` Jakub Kicinski
2024-12-21 1:04 ` Pavel Begunkov
2024-12-18 0:37 ` [PATCH net-next v9 15/20] io_uring/zcrx: add io_recvzc request David Wei
2024-12-18 0:37 ` [PATCH net-next v9 16/20] io_uring/zcrx: set pp memory provider for an rx queue David Wei
2024-12-18 0:37 ` [PATCH net-next v9 17/20] io_uring/zcrx: throttle receive requests David Wei
2024-12-18 0:37 ` [PATCH net-next v9 18/20] io_uring/zcrx: add copy fallback David Wei
2024-12-18 0:37 ` [PATCH net-next v9 19/20] net: add documentation for io_uring zcrx David Wei
2024-12-18 0:37 ` [PATCH net-next v9 20/20] io_uring/zcrx: add selftest David Wei
2024-12-18 14:40 ` [PATCH RESEND net-next v9 00/21] io_uring zero copy rx Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox