From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>
Subject: [RFC PATCH v3 07/20] io_uring: add interface queue
Date: Tue, 19 Dec 2023 13:03:44 -0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: David Wei <[email protected]>
This patch introduces a new object in io_uring called an interface queue
(ifq) which contains:
* A pool region allocated by userspace and registered w/ io_uring where
Rx data is written to.
* A net device and one specific Rx queue in it that will be configured
for ZC Rx.
* A pair of shared ringbuffers w/ userspace, dubbed registered buf
(rbuf) rings. Each entry contains a pool region id and an offset + len
within that region. The kernel writes entries into the completion ring
to tell userspace where Rx data is relative to the start of a region.
Userspace writes entries into the refill ring to tell the kernel when
it is done with the data.
For now, each io_uring instance has a single ifq, and each ifq has a
single pool region associated with one Rx queue.
Add a new opcode to io_uring_register that sets up an ifq. Size and
offsets of shared ringbuffers are returned to userspace for it to mmap.
The implementation will be added in a later patch.
Signed-off-by: David Wei <[email protected]>
---
include/linux/io_uring_types.h | 8 +++
include/uapi/linux/io_uring.h | 51 +++++++++++++++
io_uring/Makefile | 2 +-
io_uring/io_uring.c | 13 ++++
io_uring/zc_rx.c | 116 +++++++++++++++++++++++++++++++++
io_uring/zc_rx.h | 37 +++++++++++
6 files changed, 226 insertions(+), 1 deletion(-)
create mode 100644 io_uring/zc_rx.c
create mode 100644 io_uring/zc_rx.h
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index bebab36abce8..e87053b200f2 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -38,6 +38,8 @@ enum io_uring_cmd_flags {
IO_URING_F_COMPAT = (1 << 12),
};
+struct io_zc_rx_ifq;
+
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@@ -182,6 +184,10 @@ struct io_rings {
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
+/*
+ * Header of the rbuf rings: one struct io_uring for the refill ring (rq)
+ * and one for the completion ring (cq). io_allocate_rbuf_ring() places
+ * this header at the start of the ring allocation, ahead of the entries.
+ * NOTE(review): struct io_uring is presumably the head/tail index pair
+ * used for the SQ/CQ rings -- confirm against its definition.
+ */
+struct io_rbuf_ring {
+ struct io_uring rq, cq;
+};
+
struct io_restriction {
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
@@ -383,6 +389,8 @@ struct io_ring_ctx {
struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data;
+ struct io_zc_rx_ifq *ifq;
+
/* protected by ->uring_lock */
struct list_head rsrc_ref_list;
struct io_alloc_cache rsrc_node_cache;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f1c16f817742..024a6f79323b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -558,6 +558,9 @@ enum {
/* register a range of fixed file slots for automatic slot allocation */
IORING_REGISTER_FILE_ALLOC_RANGE = 25,
+ /* register a network interface queue for zerocopy */
+ IORING_REGISTER_ZC_RX_IFQ = 26,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -750,6 +753,54 @@ enum {
SOCKET_URING_OP_SETSOCKOPT,
};
+/*
+ * Refill ring entry. Per the patch description, userspace writes these
+ * to tell the kernel it is done with a piece of Rx data, identified by a
+ * pool region id plus an offset and length within that region.
+ */
+struct io_uring_rbuf_rqe {
+ __u32 off;
+ __u32 len;
+ __u16 region;
+ /* explicit padding: 4 + 4 + 2 + 6 = 16 bytes per entry */
+ __u8 __pad[6];
+};
+
+/*
+ * Completion ring entry. Per the patch description, the kernel writes
+ * these to tell userspace where Rx data is: a pool region id plus an
+ * offset and length relative to the start of that region.
+ *
+ * NOTE(review): the meaning of @sock and @flags is not defined by this
+ * patch; document them where they are first used.
+ */
+struct io_uring_rbuf_cqe {
+	__u32	off;
+	__u32	len;
+	__u16	region;
+	__u8	sock;
+	__u8	flags;
+	/*
+	 * Pad explicitly to 16 bytes. The named members add up to 12 bytes;
+	 * with only 2 bytes of padding an ABI that aligns __u32 to 4 bytes
+	 * would append 2 more bytes of implicit tail padding, which uapi
+	 * structs should not rely on. On such ABIs the layout is unchanged.
+	 */
+	__u8	__pad[4];
+};
+
+/*
+ * Offsets of the refill ring's head index, tail index and entry array.
+ * Per the patch description these are returned to userspace so that it
+ * can mmap the ring; nothing in this patch fills them in yet.
+ * NOTE(review): presumably relative to the start of the mapping -- confirm
+ * once the mmap support lands.
+ */
+struct io_rbuf_rqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 rqes;
+ /* explicit padding: 3 * 4 + 4 = 16 bytes */
+ __u8 __pad[4];
+};
+
+/*
+ * Offsets of the completion ring's head index, tail index and entry
+ * array. Per the patch description these are returned to userspace so
+ * that it can mmap the ring; nothing in this patch fills them in yet.
+ * NOTE(review): presumably relative to the start of the mapping -- confirm
+ * once the mmap support lands.
+ */
+struct io_rbuf_cqring_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 cqes;
+ /* explicit padding: 3 * 4 + 4 = 16 bytes */
+ __u8 __pad[4];
+};
+
+/*
+ * Argument for IORING_REGISTER_ZC_RX_IFQ
+ *
+ * In this patch the kernel consumes only @if_rxq_id (all-ones is rejected
+ * with -EINVAL), @rq_entries and @cq_entries. @mmap_sz, @rq_off and
+ * @cq_off are, per the patch description, meant to be returned to
+ * userspace so it can mmap the shared rings; they are not written yet.
+ *
+ * NOTE(review): @if_idx, @region_id, @flags and @cpu are not read by this
+ * patch; their semantics should be documented where they are first used.
+ */
+struct io_uring_zc_rx_ifq_reg {
+	__u32	if_idx;
+	/* hw rx descriptor ring id */
+	__u32	if_rxq_id;
+	__u32	region_id;
+	__u32	rq_entries;
+	__u32	cq_entries;
+	__u32	flags;
+	__u16	cpu;
+	/*
+	 * Explicit padding so that @mmap_sz is naturally aligned without a
+	 * compiler-inserted hole. On ABIs that align __u32 to 4 bytes this
+	 * occupies exactly the bytes that were implicit padding before.
+	 */
+	__u16	__pad;
+
+	__u32	mmap_sz;
+	struct io_rbuf_rqring_offsets rq_off;
+	struct io_rbuf_cqring_offsets cq_off;
+};
+
#ifdef __cplusplus
}
#endif
diff --git a/io_uring/Makefile b/io_uring/Makefile
index e5be47e4fc3b..6c4b4ed37a1f 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -8,6 +8,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \
- notif.o waitid.o
+ notif.o waitid.o zc_rx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1d254f2c997d..7fff01d57e9e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -95,6 +95,7 @@
#include "notif.h"
#include "waitid.h"
#include "futex.h"
+#include "zc_rx.h"
#include "timeout.h"
#include "poll.h"
@@ -2919,6 +2920,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
return;
mutex_lock(&ctx->uring_lock);
+ io_unregister_zc_rx_ifqs(ctx);
if (ctx->buf_data)
__io_sqe_buffers_unregister(ctx);
if (ctx->file_data)
@@ -3109,6 +3111,11 @@ static __cold void io_ring_exit_work(struct work_struct *work)
io_cqring_overflow_kill(ctx);
mutex_unlock(&ctx->uring_lock);
}
+ if (ctx->ifq) {
+ mutex_lock(&ctx->uring_lock);
+ io_shutdown_zc_rx_ifqs(ctx);
+ mutex_unlock(&ctx->uring_lock);
+ }
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_move_task_work_from_local(ctx);
@@ -4609,6 +4616,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_file_alloc_range(ctx, arg);
break;
+ case IORING_REGISTER_ZC_RX_IFQ:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_zc_rx_ifq(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
new file mode 100644
index 000000000000..5fc94cad5e3a
--- /dev/null
+++ b/io_uring/zc_rx.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+#if defined(CONFIG_PAGE_POOL)
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/overflow.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "kbuf.h"
+#include "zc_rx.h"
+
+/*
+ * Allocate the memory backing the rbuf rings of @ifq.
+ *
+ * One zeroed page allocation holds, in order: the struct io_rbuf_ring
+ * header, reg->rq_entries refill entries and reg->cq_entries completion
+ * entries. On success @ifq->ring, @ifq->rqes and @ifq->cqes point at the
+ * respective parts of that allocation.
+ *
+ * Returns 0 on success, -EOVERFLOW if the total size does not fit in a
+ * size_t, or -ENOMEM if the pages cannot be allocated.
+ */
+static int io_allocate_rbuf_ring(struct io_zc_rx_ifq *ifq,
+				 struct io_uring_zc_rx_ifq_reg *reg)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	size_t off, size, rq_size, cq_size;
+	void *ptr;
+
+	off = sizeof(struct io_rbuf_ring);
+	/*
+	 * The entry counts come straight from userspace. array_size() and
+	 * size_add() saturate at SIZE_MAX instead of wrapping, so a wrapped
+	 * (too small) size can never reach the allocator while the large
+	 * entry counts are recorded in the ifq.
+	 */
+	rq_size = array_size(reg->rq_entries, sizeof(*ifq->rqes));
+	cq_size = array_size(reg->cq_entries, sizeof(*ifq->cqes));
+	size = size_add(off, size_add(rq_size, cq_size));
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	ptr = (void *)__get_free_pages(gfp, get_order(size));
+	if (!ptr)
+		return -ENOMEM;
+
+	/* layout: [ header | rq entries | cq entries ] */
+	ifq->ring = ptr;
+	ifq->rqes = ptr + off;
+	ifq->cqes = ptr + off + rq_size;
+	return 0;
+}
+
+/*
+ * Drop the reference on the folio backing @ifq's ring allocation.
+ * Does nothing if no ring was ever allocated (@ifq->ring is NULL).
+ */
+static void io_free_rbuf_ring(struct io_zc_rx_ifq *ifq)
+{
+	struct io_rbuf_ring *ring = ifq->ring;
+
+	if (!ring)
+		return;
+	folio_put(virt_to_folio(ring));
+}
+
+/*
+ * Allocate a zeroed interface queue bound to @ctx. The hw rx queue id
+ * starts out as -1 (all-ones in the u32 field), the same value
+ * io_register_zc_rx_ifq() refuses to accept from userspace.
+ *
+ * Returns the new ifq, or NULL if the allocation fails.
+ */
+static struct io_zc_rx_ifq *io_zc_rx_ifq_alloc(struct io_ring_ctx *ctx)
+{
+	struct io_zc_rx_ifq *new_ifq = kzalloc(sizeof(*new_ifq), GFP_KERNEL);
+
+	if (new_ifq) {
+		new_ifq->if_rxq_id = -1;
+		new_ifq->ctx = ctx;
+	}
+	return new_ifq;
+}
+
+/*
+ * Free @ifq together with its ring memory. The ring is released first
+ * because it is reached through @ifq, which is gone after kfree().
+ */
+static void io_zc_rx_ifq_free(struct io_zc_rx_ifq *ifq)
+{
+ io_free_rbuf_ring(ifq);
+ kfree(ifq);
+}
+
+/*
+ * Handle IORING_REGISTER_ZC_RX_IFQ: create the interface queue of @ctx
+ * from the registration struct that @arg points to in userspace.
+ *
+ * For now this only allocates the ifq and its rbuf ring memory and records
+ * the ring sizes and the hw rx queue id; setting up the network interface
+ * and the zc pool is still TODO.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL  the ring was not created with IORING_SETUP_DEFER_TASKRUN, or
+ *            reg.if_rxq_id is -1 (all-ones)
+ *   -EFAULT  @arg could not be copied from userspace
+ *   -EBUSY   @ctx already has an ifq registered
+ *   -ENOMEM  the ifq could not be allocated
+ *   or the error returned by io_allocate_rbuf_ring()
+ */
+int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+			  struct io_uring_zc_rx_ifq_reg __user *arg)
+{
+	struct io_uring_zc_rx_ifq_reg reg;
+	struct io_zc_rx_ifq *ifq;
+	int ret;
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	/* only a single ifq per io_uring instance is supported */
+	if (ctx->ifq)
+		return -EBUSY;
+	/* -1 converts to all-ones for the u32 compare: "no queue" marker */
+	if (reg.if_rxq_id == -1)
+		return -EINVAL;
+
+	ifq = io_zc_rx_ifq_alloc(ctx);
+	if (!ifq)
+		return -ENOMEM;
+
+	/* TODO: initialise network interface */
+
+	ret = io_allocate_rbuf_ring(ifq, &reg);
+	if (ret)
+		goto err;
+
+	/* TODO: map zc region and initialise zc pool */
+
+	ifq->rq_entries = reg.rq_entries;
+	ifq->cq_entries = reg.cq_entries;
+	ifq->if_rxq_id = reg.if_rxq_id;
+	/* publish only once the ifq is fully set up */
+	ctx->ifq = ifq;
+
+	return 0;
+err:
+	io_zc_rx_ifq_free(ifq);
+	return ret;
+}
+
+/*
+ * Detach the interface queue from @ctx, if one is registered, and free
+ * it. The caller must hold ->uring_lock.
+ */
+void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+	struct io_zc_rx_ifq *victim;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	victim = ctx->ifq;
+	if (victim) {
+		/* clear the ctx pointer before the memory goes away */
+		ctx->ifq = NULL;
+		io_zc_rx_ifq_free(victim);
+	}
+}
+
+/*
+ * Shutdown hook for the interface queue of @ctx; io_ring_exit_work()
+ * calls it with ->uring_lock held when an ifq is registered. At this
+ * point it only asserts that the lock is held and does no other work.
+ */
+void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+ lockdep_assert_held(&ctx->uring_lock);
+}
+
+#endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
new file mode 100644
index 000000000000..aab57c1a4c5d
--- /dev/null
+++ b/io_uring/zc_rx.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef IOU_ZC_RX_H
+#define IOU_ZC_RX_H
+
+/*
+ * Interface queue (ifq): the per-io_uring object for zero-copy Rx.
+ * One instance hangs off io_ring_ctx->ifq.
+ */
+struct io_zc_rx_ifq {
+ /* owning ring, set when the ifq is allocated */
+ struct io_ring_ctx *ctx;
+ /* net device; not assigned anywhere in this patch */
+ struct net_device *dev;
+ /* start of the ring allocation (header), NULL if none was allocated */
+ struct io_rbuf_ring *ring;
+ /* refill entries, located right after the header */
+ struct io_uring_rbuf_rqe *rqes;
+ /* completion entries, located right after the refill entries */
+ struct io_uring_rbuf_cqe *cqes;
+ /* ring sizes as requested at registration time */
+ u32 rq_entries;
+ u32 cq_entries;
+
+ /* hw rx descriptor ring id */
+ u32 if_rxq_id;
+};
+
+/*
+ * The real implementations live in zc_rx.c and are only built when
+ * CONFIG_PAGE_POOL is set; otherwise the inline stubs below are used.
+ */
+#if defined(CONFIG_PAGE_POOL)
+int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+ struct io_uring_zc_rx_ifq_reg __user *arg);
+void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx);
+void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx);
+#else
+/* Without page pool support, registration always fails with -EOPNOTSUPP. */
+static inline int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
+ struct io_uring_zc_rx_ifq_reg __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+/* Nothing can have been registered, so there is nothing to tear down. */
+static inline void io_unregister_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+/* Nothing can have been registered, so there is nothing to shut down. */
+static inline void io_shutdown_zc_rx_ifqs(struct io_ring_ctx *ctx)
+{
+}
+#endif
+
+#endif
--
2.39.3
next prev parent reply other threads:[~2023-12-19 21:04 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-12-19 21:03 [RFC PATCH v3 00/20] Zero copy Rx using io_uring David Wei
2023-12-19 21:03 ` [RFC PATCH v3 01/20] net: page_pool: add ppiov mangling helper David Wei
2023-12-19 23:22 ` Mina Almasry
2023-12-19 23:59 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 02/20] tcp: don't allow non-devmem originated ppiov David Wei
2023-12-19 23:24 ` Mina Almasry
2023-12-20 1:29 ` Pavel Begunkov
2024-01-02 16:11 ` Mina Almasry
2023-12-19 21:03 ` [RFC PATCH v3 03/20] net: page pool: rework ppiov life cycle David Wei
2023-12-19 23:35 ` Mina Almasry
2023-12-20 0:49 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 04/20] net: enable napi_pp_put_page for ppiov David Wei
2023-12-19 21:03 ` [RFC PATCH v3 05/20] net: page_pool: add ->scrub mem provider callback David Wei
2023-12-19 21:03 ` [RFC PATCH v3 06/20] io_uring: separate header for exported net bits David Wei
2023-12-20 16:01 ` Jens Axboe
2023-12-19 21:03 ` David Wei [this message]
2023-12-20 16:13 ` [RFC PATCH v3 07/20] io_uring: add interface queue Jens Axboe
2023-12-20 16:23 ` Pavel Begunkov
2023-12-21 1:44 ` David Wei
2023-12-21 17:57 ` Willem de Bruijn
2023-12-30 16:25 ` Pavel Begunkov
2023-12-31 22:25 ` Willem de Bruijn
2023-12-19 21:03 ` [RFC PATCH v3 08/20] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-12-20 16:13 ` Jens Axboe
2023-12-19 21:03 ` [RFC PATCH v3 09/20] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-12-19 21:03 ` [RFC PATCH v3 10/20] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2023-12-20 16:06 ` Jens Axboe
2023-12-20 16:24 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 11/20] io_uring/zcrx: implement socket registration David Wei
2023-12-19 21:03 ` [RFC PATCH v3 12/20] io_uring: add ZC buf and pool David Wei
2023-12-19 21:03 ` [RFC PATCH v3 13/20] io_uring: implement pp memory provider for zc rx David Wei
2023-12-19 23:44 ` Mina Almasry
2023-12-20 0:39 ` Pavel Begunkov
2023-12-21 19:36 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 14/20] net: page pool: add io_uring memory provider David Wei
2023-12-19 23:39 ` Mina Almasry
2023-12-20 0:04 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 15/20] io_uring: add io_recvzc request David Wei
2023-12-20 16:27 ` Jens Axboe
2023-12-20 17:04 ` Pavel Begunkov
2023-12-20 18:09 ` Jens Axboe
2023-12-21 18:59 ` Pavel Begunkov
2023-12-21 21:32 ` Jens Axboe
2023-12-30 21:15 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 16/20] net: execute custom callback from napi David Wei
2023-12-19 21:03 ` [RFC PATCH v3 17/20] io_uring/zcrx: add copy fallback David Wei
2023-12-19 21:03 ` [RFC PATCH v3 18/20] veth: add support for io_uring zc rx David Wei
2023-12-19 21:03 ` [RFC PATCH v3 19/20] net: page pool: generalise ppiov dma address get David Wei
2023-12-21 19:51 ` Mina Almasry
2023-12-19 21:03 ` [RFC PATCH v3 20/20] bnxt: enable io_uring zc page pool David Wei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox