* [PATCH liburing v2 0/4] add basic zero copy receive support
@ 2025-02-18 22:01 David Wei
2025-02-18 22:01 ` [PATCH liburing v2 1/4] liburing: sync io_uring headers David Wei
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: David Wei @ 2025-02-18 22:01 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe, Pavel Begunkov
Add basic support for io_uring zero copy receive in liburing. Besides
the mandatory syncing of necessary liburing.h headers, add a thin
wrapper around the registration op and a unit test.
Users still need to do the setup by hand, e.g. mmap, set up the
registration structs, do the registration and then set up the refill
queue struct io_uring_zcrx_rq.
In the future, I'll add code to hide the implementation details. But for
now, this unblocks the kernel selftest.
Changes in v2:
--------------
* Split out linux/io_uring.h header sync changes
* Move exporting io_uring_register_ifq() to LIBURING_2.10 section
David Wei (4):
liburing: sync io_uring headers
zcrx: sync io_uring headers
zcrx: add basic support
zcrx: add unit test
src/include/liburing.h | 12 +
src/include/liburing/io_uring.h | 62 ++-
src/liburing-ffi.map | 2 +
src/liburing.map | 2 +
src/register.c | 6 +
test/Makefile | 1 +
test/zcrx.c | 918 ++++++++++++++++++++++++++++++++
7 files changed, 1001 insertions(+), 2 deletions(-)
create mode 100644 test/zcrx.c
--
2.43.5
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH liburing v2 1/4] liburing: sync io_uring headers
2025-02-18 22:01 [PATCH liburing v2 0/4] add basic zero copy receive support David Wei
@ 2025-02-18 22:01 ` David Wei
2025-02-18 22:01 ` [PATCH liburing v2 2/4] zcrx: " David Wei
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: David Wei @ 2025-02-18 22:01 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe, Pavel Begunkov
In preparation for syncing headers with the zero copy rx changes, first
sync other unrelated changes.
Signed-off-by: David Wei <[email protected]>
---
src/include/liburing/io_uring.h | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 765919883cff..452240a6ebb4 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -364,7 +364,7 @@ enum io_uring_op {
* result will be the number of buffers send, with
* the starting buffer ID in cqe->flags as per
* usual for provided buffer usage. The buffers
- * will be contiguous from the starting buffer ID.
+ * will be contigious from the starting buffer ID.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
#define IORING_RECV_MULTISHOT (1U << 1)
@@ -424,7 +424,7 @@ enum io_uring_msg_ring_flags {
* IO completion data structure (Completion Queue Entry)
*/
struct io_uring_cqe {
- __u64 user_data; /* sqe->user_data submission passed back */
+ __u64 user_data; /* sqe->user_data value passed back */
__s32 res; /* result code for this event */
__u32 flags;
@@ -616,6 +616,10 @@ enum io_uring_register_op {
/* clone registered buffers from source ring to current ring */
IORING_REGISTER_CLONE_BUFFERS = 30,
+ /* send MSG_RING without having a ring */
+ IORING_REGISTER_SEND_MSG_RING = 31,
+
+ /* resize CQ ring */
IORING_REGISTER_RESIZE_RINGS = 33,
IORING_REGISTER_MEM_REGION = 34,
--
2.43.5
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH liburing v2 2/4] zcrx: sync io_uring headers
2025-02-18 22:01 [PATCH liburing v2 0/4] add basic zero copy receive support David Wei
2025-02-18 22:01 ` [PATCH liburing v2 1/4] liburing: sync io_uring headers David Wei
@ 2025-02-18 22:01 ` David Wei
2025-02-18 22:01 ` [PATCH liburing v2 3/4] zcrx: add basic support David Wei
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: David Wei @ 2025-02-18 22:01 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe, Pavel Begunkov
Sync linux/io_uring.h with zcrx changes.
Signed-off-by: David Wei <[email protected]>
---
src/include/liburing/io_uring.h | 54 +++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 452240a6ebb4..d2fcd1d22ea0 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -87,6 +87,7 @@ struct io_uring_sqe {
union {
__s32 splice_fd_in;
__u32 file_index;
+ __u32 zcrx_ifq_idx;
__u32 optlen;
struct {
__u16 addr_len;
@@ -262,6 +263,7 @@ enum io_uring_op {
IORING_OP_FTRUNCATE,
IORING_OP_BIND,
IORING_OP_LISTEN,
+ IORING_OP_RECV_ZC,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -619,6 +621,9 @@ enum io_uring_register_op {
/* send MSG_RING without having a ring */
IORING_REGISTER_SEND_MSG_RING = 31,
+ /* register a netdev hw rx queue for zerocopy */
+ IORING_REGISTER_ZCRX_IFQ = 32,
+
/* resize CQ ring */
IORING_REGISTER_RESIZE_RINGS = 33,
@@ -920,6 +925,55 @@ enum io_uring_socket_op {
SOCKET_URING_OP_SETSOCKOPT,
};
+/* Zero copy receive refill queue entry */
+struct io_uring_zcrx_rqe {
+ __u64 off;
+ __u32 len;
+ __u32 __pad;
+};
+
+struct io_uring_zcrx_cqe {
+ __u64 off;
+ __u64 __pad;
+};
+
+/* The bit from which area id is encoded into offsets */
+#define IORING_ZCRX_AREA_SHIFT 48
+#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
+
+struct io_uring_zcrx_offsets {
+ __u32 head;
+ __u32 tail;
+ __u32 rqes;
+ __u32 __resv2;
+ __u64 __resv[2];
+};
+
+struct io_uring_zcrx_area_reg {
+ __u64 addr;
+ __u64 len;
+ __u64 rq_area_token;
+ __u32 flags;
+ __u32 __resv1;
+ __u64 __resv2[2];
+};
+
+/*
+ * Argument for IORING_REGISTER_ZCRX_IFQ
+ */
+struct io_uring_zcrx_ifq_reg {
+ __u32 if_idx;
+ __u32 if_rxq;
+ __u32 rq_entries;
+ __u32 flags;
+
+ __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
+ __u64 region_ptr; /* struct io_uring_region_desc * */
+
+ struct io_uring_zcrx_offsets offsets;
+ __u64 __resv[4];
+};
+
#ifdef __cplusplus
}
#endif
--
2.43.5
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH liburing v2 3/4] zcrx: add basic support
2025-02-18 22:01 [PATCH liburing v2 0/4] add basic zero copy receive support David Wei
2025-02-18 22:01 ` [PATCH liburing v2 1/4] liburing: sync io_uring headers David Wei
2025-02-18 22:01 ` [PATCH liburing v2 2/4] zcrx: " David Wei
@ 2025-02-18 22:01 ` David Wei
2025-02-18 22:01 ` [PATCH liburing v2 4/4] zcrx: add unit test David Wei
2025-02-18 23:31 ` [PATCH liburing v2 0/4] add basic zero copy receive support Jens Axboe
4 siblings, 0 replies; 6+ messages in thread
From: David Wei @ 2025-02-18 22:01 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe, Pavel Begunkov
Add basic support for zcrx with a thin wrapper around
IORING_REGISTER_ZCRX_IFQ and a struct for the refill queue.
Signed-off-by: David Wei <[email protected]>
---
src/include/liburing.h | 12 ++++++++++++
src/liburing-ffi.map | 2 ++
src/liburing.map | 2 ++
src/register.c | 6 ++++++
4 files changed, 22 insertions(+)
diff --git a/src/include/liburing.h b/src/include/liburing.h
index 49b4edf437b2..6393599cb3bf 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -132,6 +132,16 @@ struct io_uring {
unsigned pad2;
};
+struct io_uring_zcrx_rq {
+ __u32 *khead;
+ __u32 *ktail;
+ __u32 rq_tail;
+ unsigned ring_entries;
+
+ struct io_uring_zcrx_rqe *rqes;
+ void *ring_ptr;
+};
+
/*
* Library interface
*/
@@ -265,6 +275,8 @@ int io_uring_register_file_alloc_range(struct io_uring *ring,
int io_uring_register_napi(struct io_uring *ring, struct io_uring_napi *napi);
int io_uring_unregister_napi(struct io_uring *ring, struct io_uring_napi *napi);
+int io_uring_register_ifq(struct io_uring *ring,
+ struct io_uring_zcrx_ifq_reg *reg);
int io_uring_register_clock(struct io_uring *ring,
struct io_uring_clock_register *arg);
diff --git a/src/liburing-ffi.map b/src/liburing-ffi.map
index 968ccbc67366..fe14adb6d83f 100644
--- a/src/liburing-ffi.map
+++ b/src/liburing-ffi.map
@@ -227,4 +227,6 @@ LIBURING_2.9 {
} LIBURING_2.8;
LIBURING_2.10 {
+ global:
+ io_uring_register_ifq;
} LIBURING_2.9;
diff --git a/src/liburing.map b/src/liburing.map
index 264a94946e90..d1661d9d61f9 100644
--- a/src/liburing.map
+++ b/src/liburing.map
@@ -113,4 +113,6 @@ LIBURING_2.9 {
} LIBURING_2.8;
LIBURING_2.10 {
+ global:
+ io_uring_register_ifq;
} LIBURING_2.9;
diff --git a/src/register.c b/src/register.c
index 0fff208cd5f5..99337d13135d 100644
--- a/src/register.c
+++ b/src/register.c
@@ -422,6 +422,12 @@ int io_uring_clone_buffers(struct io_uring *dst, struct io_uring *src)
return io_uring_clone_buffers_offset(dst, src, 0, 0, 0, 0);
}
+int io_uring_register_ifq(struct io_uring *ring,
+ struct io_uring_zcrx_ifq_reg *reg)
+{
+ return do_register(ring, IORING_REGISTER_ZCRX_IFQ, reg, 1);
+}
+
int io_uring_resize_rings(struct io_uring *ring, struct io_uring_params *p)
{
unsigned sq_head, sq_tail;
--
2.43.5
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH liburing v2 4/4] zcrx: add unit test
2025-02-18 22:01 [PATCH liburing v2 0/4] add basic zero copy receive support David Wei
` (2 preceding siblings ...)
2025-02-18 22:01 ` [PATCH liburing v2 3/4] zcrx: add basic support David Wei
@ 2025-02-18 22:01 ` David Wei
2025-02-18 23:31 ` [PATCH liburing v2 0/4] add basic zero copy receive support Jens Axboe
4 siblings, 0 replies; 6+ messages in thread
From: David Wei @ 2025-02-18 22:01 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe, Pavel Begunkov
Tests for registration and a basic recv test. No zero copy is actually
happening but it does test the copy fallback.
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
test/Makefile | 1 +
test/zcrx.c | 918 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 919 insertions(+)
create mode 100644 test/zcrx.c
diff --git a/test/Makefile b/test/Makefile
index 64e1480867d4..d92379ad9ed0 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -239,6 +239,7 @@ test_srcs := \
wakeup-hang.c \
wq-aff.c \
xattr.c \
+ zcrx.c \
# EOL
# Please keep this list sorted alphabetically.
diff --git a/test/zcrx.c b/test/zcrx.c
new file mode 100644
index 000000000000..d3221d3c2b83
--- /dev/null
+++ b/test/zcrx.c
@@ -0,0 +1,918 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Simple test case showing using send and recv through io_uring
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <net/if.h>
+
+#include "liburing.h"
+#include "helpers.h"
+
+static unsigned int ifidx, rxq;
+
+/* the hw rxq must consume 128 of these pages, leaving 4 left */
+#define AREA_PAGES 132
+#define PAGE_SIZE 4096
+#define AREA_SZ AREA_PAGES * PAGE_SIZE
+#define RQ_ENTRIES 128
+/* this is one more than the # of free pages after filling hw rxq */
+#define LOOP_COUNT 5
+#define DEV_ENV_VAR "NETIF"
+#define RXQ_ENV_VAR "NETRXQ"
+#define RING_FLAGS (IORING_SETUP_DEFER_TASKRUN | \
+ IORING_SETUP_CQE32 | \
+ IORING_SETUP_SINGLE_ISSUER)
+
+static char str[] = "iv5t4dl500w7wsrf14fsuq8thptto0z7i2q62z1p8dwrv5u4kaxpqhm2rb7bapddi5gfkh7f9695eh46t2o5yap2y43gstbsq3n90bg1i7zx1m4wojoqbuxhsrw4s4y3sh9qp57ovbaa2o9yaqa7d4to2vak1otvgkoxs5t0ovjbe6roginrjeh906kmjn1289jlho9a1bud02ex4xr3cvfcybpl6axnr117p0aesb3070wlvj91en7tpf8nyb1e";
+
+#define MSG_SIZE 512
+
+#define PORT 10202
+#define HOST "127.0.0.1"
+
+static int probe_zcrx(void *area)
+{
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)area,
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+ struct io_uring ring;
+ int ret;
+
+ ret = t_create_ring(8, &ring, RING_FLAGS);
+ if (ret == T_SETUP_SKIP) {
+ fprintf(stderr, "required ring flags are not supported, skip\n");
+ return T_EXIT_SKIP;
+ }
+ if (ret) {
+ fprintf(stderr, "probe ring setup failure\n");
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_register_ifq(&ring, ®);
+ if (ret == -EINVAL) {
+ fprintf(stderr, "zcrx is not supported, skip\n");
+ return T_EXIT_SKIP;
+ }
+ if (ret) {
+ fprintf(stderr, "probe zcrx register fail %i\n", ret);
+ return T_EXIT_FAIL;
+ }
+ io_uring_queue_exit(&ring);
+ return T_EXIT_PASS;
+}
+
+static int try_register_ifq(struct io_uring_zcrx_ifq_reg *reg)
+{
+ struct io_uring ring;
+ int ret;
+
+ ret = t_create_ring(8, &ring, RING_FLAGS);
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "ring create failed: %d\n", ret);
+ exit(T_EXIT_FAIL);
+ }
+
+ ret = io_uring_register_ifq(&ring, reg);
+ io_uring_queue_exit(&ring);
+ return ret;
+}
+
+static int test_invalid_if(void *area)
+{
+ int ret;
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)area,
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = -1,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+
+ ret = try_register_ifq(®);
+ if (ret != -EINVAL && ret != -ENODEV) {
+ fprintf(stderr, "registered invalid IF %i\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ reg.if_idx = ifidx;
+ reg.if_rxq = -1;
+
+ ret = try_register_ifq(®);
+ if (ret != -EINVAL) {
+ fprintf(stderr, "registered invalid IFQ %i\n", ret);
+ return T_EXIT_FAIL;
+ }
+ return T_EXIT_PASS;
+}
+
+static int test_invalid_ifq_collision(void *area)
+{
+ struct io_uring ring, ring2;
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)area,
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+ int ret;
+
+ ret = t_create_ring(8, &ring, RING_FLAGS);
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "ring create failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+ ret = t_create_ring(8, &ring2, RING_FLAGS);
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "ring2 create failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_register_ifq(&ring, ®);
+ if (ret) {
+ fprintf(stderr, "initial registration failed %i\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ /* register taken ifq */
+ ret = io_uring_register_ifq(&ring, ®);
+ if (!ret) {
+ fprintf(stderr, "registered taken queue\n");
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_register_ifq(&ring2, ®);
+ if (!ret) {
+ fprintf(stderr, "registered taken queue ring2\n");
+ return T_EXIT_FAIL;
+ }
+
+ io_uring_queue_exit(&ring);
+ io_uring_queue_exit(&ring2);
+ return T_EXIT_PASS;
+}
+
+static int test_rq_setup(void *area)
+{
+ int ret;
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)area,
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = 0,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+
+ ret = try_register_ifq(®);
+ if (ret != -EINVAL) {
+ fprintf(stderr, "registered 0 rq entries\n");
+ return T_EXIT_FAIL;
+ }
+
+ reg.rq_entries = (__u32)-1;
+
+ ret = try_register_ifq(®);
+ if (ret != -EINVAL) {
+ fprintf(stderr, "registered unlimited nr of rq entries\n");
+ return T_EXIT_FAIL;
+ }
+
+ reg.rq_entries = RQ_ENTRIES - 1;
+ ret = try_register_ifq(®);
+ if (ret != 0) {
+ fprintf(stderr, "ifq registration failed %i\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ if (reg.rq_entries == RQ_ENTRIES - 1) {
+ fprintf(stderr, "registered non pow2 refill entries %i\n",
+ reg.rq_entries);
+ return T_EXIT_FAIL;
+ }
+
+ return T_EXIT_PASS;
+}
+
+static int test_null_area_reg_struct(void)
+{
+ int ret;
+
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)0,
+ };
+
+ ret = try_register_ifq(®);
+ return ret ? T_EXIT_PASS : T_EXIT_FAIL;
+}
+
+static int test_null_area(void)
+{
+ int ret;
+
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)0,
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+
+ ret = try_register_ifq(®);
+ return ret ? T_EXIT_PASS : T_EXIT_FAIL;
+}
+
+static int test_misaligned_area(void *area)
+{
+ int ret;
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)(area + 1),
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+
+ if (!try_register_ifq(®))
+ return T_EXIT_FAIL;
+
+ area_reg.addr = (__u64)(unsigned long)area;
+ area_reg.len = AREA_SZ - 1;
+ ret = try_register_ifq(®);
+ return ret ? T_EXIT_PASS : T_EXIT_FAIL;
+}
+
+static int test_larger_than_alloc_area(void *area)
+{
+ int ret;
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)area,
+ .len = AREA_SZ + 4096,
+ .flags = 0,
+ };
+
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+
+ ret = try_register_ifq(®);
+ return ret ? T_EXIT_PASS : T_EXIT_FAIL;
+}
+
+static int test_area_access(void)
+{
+ struct io_uring_zcrx_area_reg area_reg = {
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+ int i, ret;
+ void *area;
+
+ for (i = 0; i < 2; i++) {
+ int ro = i & 1;
+ int prot = ro ? PROT_READ : PROT_WRITE;
+
+ area = mmap(NULL, AREA_SZ, prot,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (area == MAP_FAILED) {
+ perror("mmap");
+ return T_EXIT_FAIL;
+ }
+
+ area_reg.addr = (__u64)(unsigned long)area;
+
+ ret = try_register_ifq(®);
+ if (ret != -EFAULT) {
+ fprintf(stderr, "registered unaccessible memory\n");
+ return T_EXIT_FAIL;
+ }
+
+ munmap(area, AREA_SZ);
+ }
+
+ return T_EXIT_PASS;
+}
+
+static int create_ring_with_ifq(struct io_uring *ring, void *area)
+{
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)area,
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+ int ret;
+
+ ret = t_create_ring(128, ring, RING_FLAGS);
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "ring create failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ ret = io_uring_register_ifq(ring, ®);
+ if (ret) {
+ io_uring_queue_exit(ring);
+ fprintf(stderr, "ifq register failed %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+ return 0;
+}
+
+static void test_io_uring_prep_zcrx(struct io_uring_sqe *sqe, int fd, int ifq)
+{
+ io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, fd, NULL, 0, 0);
+ sqe->zcrx_ifq_idx = ifq;
+ sqe->ioprio |= IORING_RECV_MULTISHOT;
+}
+
+static struct io_uring_cqe *submit_and_wait_one(struct io_uring *ring)
+{
+ struct io_uring_cqe *cqe;
+ int ret;
+
+ ret = io_uring_submit(ring);
+ if (ret != 1) {
+ fprintf(stderr, "sqe submit failed: %d\n", ret);
+ return NULL;
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "wait completion %d\n", ret);
+ return NULL;
+ }
+
+ return cqe;
+}
+
+static int test_invalid_invalid_request(void *area)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ struct io_uring ring;
+ int ret, fds[2];
+
+ ret = create_ring_with_ifq(&ring, area);
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "ifq-ring create failed: %d\n", ret);
+ return T_EXIT_FAIL;
+ }
+
+ ret = t_create_socket_pair(fds, true);
+ if (ret) {
+ fprintf(stderr, "t_create_socket_pair failed: %d\n", ret);
+ return ret;
+ }
+
+ /* invalid file */
+ sqe = io_uring_get_sqe(&ring);
+ test_io_uring_prep_zcrx(sqe, ring.ring_fd, 0);
+
+ cqe = submit_and_wait_one(&ring);
+ if (!cqe) {
+ fprintf(stderr, "submit_and_wait_one failed\n");
+ return T_EXIT_FAIL;
+ }
+ if (cqe->flags & IORING_CQE_F_MORE) {
+ fprintf(stderr, "unexpected F_MORE for invalid fd\n");
+ return T_EXIT_FAIL;
+ }
+ if (cqe->res != -ENOTSOCK) {
+ fprintf(stderr, "zcrx for non-socket file\n");
+ return T_EXIT_FAIL;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+
+ /* invalid ifq idx */
+ sqe = io_uring_get_sqe(&ring);
+ test_io_uring_prep_zcrx(sqe, fds[0], 1);
+
+ cqe = submit_and_wait_one(&ring);
+ if (!cqe) {
+ fprintf(stderr, "submit_and_wait_one failed\n");
+ return T_EXIT_FAIL;
+ }
+ if (cqe->flags & IORING_CQE_F_MORE) {
+ fprintf(stderr, "unexpected F_MORE for invalid fd\n");
+ return T_EXIT_FAIL;
+ }
+ if (cqe->res != -EINVAL) {
+ fprintf(stderr, "zcrx recv with non-existent zcrx ifq\n");
+ return T_EXIT_FAIL;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+
+ close(fds[0]);
+ close(fds[1]);
+ io_uring_queue_exit(&ring);
+ return 0;
+}
+
+struct recv_data {
+ pthread_barrier_t connect;
+ pthread_barrier_t startup;
+ pthread_barrier_t barrier;
+ pthread_barrier_t finish;
+
+ int accept_fd;
+ char buf[MSG_SIZE];
+ void *area;
+ void *ring_ptr;
+ unsigned int ring_sz;
+ struct io_uring_zcrx_rq rq_ring;
+};
+
+static int recv_prep(struct io_uring *ring, struct recv_data *rd, int *sock)
+{
+ struct sockaddr_in saddr;
+ struct io_uring_sqe *sqe;
+ int sockfd, ret, val, use_fd;
+ socklen_t socklen;
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_addr.s_addr = htonl(INADDR_ANY);
+ saddr.sin_port = htons(PORT);
+
+ sockfd = socket(AF_INET, SOCK_STREAM, 0);
+ if (sockfd < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ val = 1;
+ setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
+
+ ret = bind(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
+ if (ret < 0) {
+ perror("bind");
+ goto err;
+ }
+
+ ret = listen(sockfd, 1);
+ if (ret < 0) {
+ perror("listen");
+ goto err;
+ }
+
+ pthread_barrier_wait(&rd->connect);
+
+ socklen = sizeof(saddr);
+ use_fd = accept(sockfd, (struct sockaddr *)&saddr, &socklen);
+ if (use_fd < 0) {
+ perror("accept");
+ goto err;
+ }
+
+ rd->accept_fd = use_fd;
+ pthread_barrier_wait(&rd->startup);
+ pthread_barrier_wait(&rd->barrier);
+
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, use_fd, NULL, 0, 0);
+ sqe->zcrx_ifq_idx = 0;
+ sqe->ioprio |= IORING_RECV_MULTISHOT;
+ sqe->user_data = 2;
+
+ ret = io_uring_submit(ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+
+ *sock = sockfd;
+ return 0;
+err:
+ close(sockfd);
+ return 1;
+}
+
+static struct io_uring_zcrx_rqe* get_refill_entry(struct io_uring_zcrx_rq *rq_ring)
+{
+ unsigned mask = rq_ring->ring_entries - 1;
+ struct io_uring_zcrx_rqe* rqe;
+
+ rqe = &rq_ring->rqes[rq_ring->rq_tail & mask];
+ rq_ring->rq_tail++;
+ return rqe;
+}
+
+static void refill_garbage(struct recv_data *rd, uint64_t area_token)
+{
+ struct io_uring_zcrx_rq *rq_ring = &rd->rq_ring;
+ struct io_uring_zcrx_rqe* rqe;
+ int i = 0;
+
+ /* invalid area */
+ rqe = get_refill_entry(rq_ring);
+ rqe->off = (area_token + 1) << IORING_ZCRX_AREA_SHIFT;
+ i++;
+
+ /* invalid area offset */
+ rqe = get_refill_entry(rq_ring);
+ rqe->off = AREA_SZ | (area_token << IORING_ZCRX_AREA_SHIFT);
+ rqe->off += AREA_SZ;
+ i++;
+
+ for (; i < rq_ring->ring_entries; i++) {
+ rqe = get_refill_entry(rq_ring);
+ rqe->off = ((uint64_t)1 << IORING_ZCRX_AREA_SHIFT) - 1;
+ }
+
+ io_uring_smp_store_release(rq_ring->ktail, rq_ring->rq_tail);
+}
+
+static int do_recv(struct io_uring *ring, struct recv_data *rd,
+ uint64_t refill_area_token)
+{
+ struct io_uring_cqe *cqe;
+ struct io_uring_zcrx_cqe *zcqe;
+ int i, ret;
+
+ refill_garbage(rd, refill_area_token);
+
+ for (i = 0; i < LOOP_COUNT - 1; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stdout, "wait_cqe: %d\n", ret);
+ return 1;
+ }
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "recv not supported, skipping\n");
+ goto out;
+ }
+ if (cqe->res < 0) {
+ fprintf(stderr, "failed recv cqe: %d\n", cqe->res);
+ goto err;
+ }
+ if (cqe->res - 1 != strlen(str)) {
+ fprintf(stderr, "got wrong length: %d/%d\n", cqe->res,
+ (int) strlen(str) + 1);
+ goto err;
+ }
+
+ zcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
+ uint64_t mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1;
+ uint64_t off = zcqe->off & mask;
+ void *addr = (char *)rd->area + off;
+ ret = strncmp(str, addr, sizeof(str));
+ if (ret != 0) {
+ fprintf(stderr, "recv incorrect payload: %s\n", (const char *)addr);
+ goto err;
+ }
+
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stdout, "wait_cqe: %d\n", ret);
+ return 1;
+ }
+ if (cqe->res != -ENOMEM) {
+ fprintf(stdout, "final recv cqe did not return ENOMEM\n");
+ goto err;
+ }
+
+out:
+ io_uring_cqe_seen(ring, cqe);
+ pthread_barrier_wait(&rd->finish);
+ return 0;
+err:
+ io_uring_cqe_seen(ring, cqe);
+ pthread_barrier_wait(&rd->finish);
+ return 1;
+}
+
+static void *recv_fn(void *data)
+{
+ struct recv_data *rd = data;
+ struct io_uring_params p = { };
+ struct io_uring ring;
+ int ret, sock;
+
+ p.flags = RING_FLAGS;
+ ret = t_create_ring_params(8, &ring, &p);
+ if (ret == T_SETUP_SKIP) {
+ ret = 0;
+ goto err;
+ } else if (ret < 0) {
+ goto err;
+ }
+
+ struct io_uring_zcrx_area_reg area_reg = {
+ .addr = (__u64)(unsigned long)rd->area,
+ .len = AREA_SZ,
+ .flags = 0,
+ };
+
+ struct io_uring_zcrx_ifq_reg reg = {
+ .if_idx = ifidx,
+ .if_rxq = rxq,
+ .rq_entries = RQ_ENTRIES,
+ .area_ptr = (__u64)(unsigned long)&area_reg,
+ };
+
+ ret = io_uring_register_ifq(&ring, ®);
+ if (ret != 0) {
+ fprintf(stderr, "register_ifq failed: %d\n", ret);
+ goto err_ring_exit;
+ }
+
+ /*
+ rd->ring_ptr = mmap(
+ 0,
+ reg.offsets.mmap_sz,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE,
+ ring.enter_ring_fd,
+ IORING_OFF_RQ_RING
+ );
+
+ rd->ring_sz = reg.offsets.mmap_sz;
+ */
+ rd->rq_ring.khead = (__u32*)((char*)rd->ring_ptr + reg.offsets.head);
+ rd->rq_ring.ktail = (__u32*)((char*)rd->ring_ptr + reg.offsets.tail);
+ rd->rq_ring.rqes = (struct io_uring_zcrx_rqe*)((char*)rd->ring_ptr + reg.offsets.rqes);
+ rd->rq_ring.rq_tail = 0;
+ rd->rq_ring.ring_entries = reg.rq_entries;
+
+ ret = recv_prep(&ring, rd, &sock);
+ if (ret) {
+ fprintf(stderr, "recv_prep failed: %d\n", ret);
+ goto err;
+ }
+ ret = do_recv(&ring, rd, area_reg.rq_area_token);
+
+ close(sock);
+ close(rd->accept_fd);
+err_ring_exit:
+ io_uring_queue_exit(&ring);
+err:
+ return (void *)(intptr_t)ret;
+}
+
+static int do_send(struct recv_data *rd)
+{
+ struct sockaddr_in saddr;
+ struct iovec iov = {
+ .iov_base = str,
+ .iov_len = sizeof(str),
+ };
+ struct io_uring ring;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ int i, sockfd, ret;
+
+ ret = io_uring_queue_init(8, &ring, 0);
+ if (ret) {
+ fprintf(stderr, "queue init failed: %d\n", ret);
+ return 1;
+ }
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_port = htons(PORT);
+ inet_pton(AF_INET, HOST, &saddr.sin_addr);
+
+ sockfd = socket(AF_INET, SOCK_STREAM, 0);
+ if (sockfd < 0) {
+ perror("socket");
+ goto err2;
+ }
+
+ pthread_barrier_wait(&rd->connect);
+
+ ret = connect(sockfd, (struct sockaddr *)&saddr, sizeof(saddr));
+ if (ret < 0) {
+ perror("connect");
+ goto err;
+ }
+
+ pthread_barrier_wait(&rd->startup);
+
+ for (i = 0; i < LOOP_COUNT; i++) {
+ sqe = io_uring_get_sqe(&ring);
+ io_uring_prep_send(sqe, sockfd, iov.iov_base, iov.iov_len, 0);
+ sqe->user_data = 1;
+ }
+
+ ret = io_uring_submit(&ring);
+ if (ret <= 0) {
+ fprintf(stderr, "submit failed: %d\n", ret);
+ goto err;
+ }
+
+ pthread_barrier_wait(&rd->barrier);
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (cqe->res == -EINVAL) {
+ fprintf(stdout, "send not supported, skipping\n");
+ goto err;
+ }
+ if (cqe->res != iov.iov_len) {
+ fprintf(stderr, "failed cqe: %d\n", cqe->res);
+ goto err;
+ }
+
+ pthread_barrier_wait(&rd->finish);
+
+ close(sockfd);
+ io_uring_queue_exit(&ring);
+ return 0;
+
+err:
+ close(sockfd);
+err2:
+ io_uring_queue_exit(&ring);
+ pthread_barrier_wait(&rd->finish);
+ return 1;
+}
+
+static int test_recv(void *area)
+{
+ pthread_t recv_thread;
+ struct recv_data rd;
+ int ret;
+ void *retval;
+
+ memset(&rd, 0, sizeof(rd));
+ pthread_barrier_init(&rd.connect, NULL, 2);
+ pthread_barrier_init(&rd.startup, NULL, 2);
+ pthread_barrier_init(&rd.barrier, NULL, 2);
+ pthread_barrier_init(&rd.finish, NULL, 2);
+ rd.area = area;
+
+ ret = pthread_create(&recv_thread, NULL, recv_fn, &rd);
+ if (ret) {
+ fprintf(stderr, "Thread create failed: %d\n", ret);
+ return 1;
+ }
+
+ do_send(&rd);
+ pthread_join(recv_thread, &retval);
+ return (intptr_t)retval;
+}
+
+int main(int argc, char *argv[])
+{
+ char *dev, *rxq_str, *rxq_end;
+ void *area_outer, *area;
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ area_outer = mmap(NULL, AREA_SZ + 8192, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
+ if (area_outer == MAP_FAILED) {
+ perror("mmap");
+ return T_EXIT_FAIL;
+ }
+
+ area = mmap(area_outer, AREA_SZ, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (area == MAP_FAILED) {
+ perror("mmap");
+ return T_EXIT_FAIL;
+ }
+
+ dev = getenv(DEV_ENV_VAR);
+ if (!dev)
+ return T_EXIT_SKIP;
+
+ ifidx = if_nametoindex(dev);
+ if (!ifidx)
+ return T_EXIT_SKIP;
+
+ rxq_str = getenv(RXQ_ENV_VAR);
+ if (!rxq_str)
+ return T_EXIT_SKIP;
+
+ rxq = strtol(rxq_str, &rxq_end, 10);
+ if (rxq_end == rxq_str || *rxq_end != '\0')
+ return T_EXIT_SKIP;
+
+ ret = probe_zcrx(area);
+ if (ret != T_EXIT_PASS)
+ return ret;
+
+ ret = test_rq_setup(area);
+ if (ret) {
+ fprintf(stderr, "test_invalid_reg_struct failed\n");
+ return ret;
+ }
+
+ ret = test_null_area_reg_struct();
+ if (ret) {
+ fprintf(stderr, "test_null_area_reg_struct failed\n");
+ return ret;
+ }
+
+ ret = test_null_area();
+ if (ret) {
+ fprintf(stderr, "test_null_area failed\n");
+ return ret;
+ }
+
+ ret = test_misaligned_area(area);
+ if (ret) {
+ fprintf(stderr, "test_misaligned_area failed\n");
+ return ret;
+ }
+
+ ret = test_larger_than_alloc_area(area);
+ if (ret) {
+ fprintf(stderr, "test_larger_than_alloc_area failed\n");
+ return ret;
+ }
+
+ ret = test_area_access();
+ if (ret) {
+ fprintf(stderr, "test_area_access failed\n");
+ return ret;
+ }
+
+ ret = test_invalid_if(area);
+ if (ret) {
+ fprintf(stderr, "test_invalid_if failed\n");
+ return ret;
+ }
+
+ ret = test_invalid_ifq_collision(area);
+ if (ret) {
+ fprintf(stderr, "test_invalid_ifq_collision failed\n");
+ return ret;
+ }
+
+ ret = test_invalid_invalid_request(area);
+ if (ret) {
+ fprintf(stderr, "test_invalid_ifq_collision failed\n");
+ return ret;
+ }
+
+ ret = test_recv(area);
+ if (ret) {
+ fprintf(stderr, "test_recv failed\n");
+ return ret;
+ }
+
+ munmap(area, AREA_SZ);
+ return 0;
+}
--
2.43.5
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH liburing v2 0/4] add basic zero copy receive support
2025-02-18 22:01 [PATCH liburing v2 0/4] add basic zero copy receive support David Wei
` (3 preceding siblings ...)
2025-02-18 22:01 ` [PATCH liburing v2 4/4] zcrx: add unit test David Wei
@ 2025-02-18 23:31 ` Jens Axboe
4 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2025-02-18 23:31 UTC (permalink / raw)
To: io-uring, David Wei; +Cc: Pavel Begunkov
On Tue, 18 Feb 2025 14:01:32 -0800, David Wei wrote:
> Add basic support for io_uring zero copy receive in liburing. Besides
> the mandatory syncing of necessary liburing.h headers, add a thin
> wrapper around the registration op and a unit test.
>
> Users still need to do the setup by hand, e.g. mmap, set up the
> registration structs, do the registration and then set up the refill
> queue struct io_uring_zcrx_rq.
>
> [...]
Applied, thanks!
[1/4] liburing: sync io_uring headers
commit: 48d8d54e524a9c37e0cc52921bb41070156a597f
[2/4] zcrx: sync io_uring headers
commit: ce3a65747d43a405cc19a630d5f8a0f613293f5c
[3/4] zcrx: add basic support
commit: d7ec4ce3421fbbdaba07426d589b72e204ac92e9
[4/4] zcrx: add unit test
commit: 30bd7baa5627f066fc0bfbca293f950fdf3ef3f1
Best regards,
--
Jens Axboe
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-02-18 23:31 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-02-18 22:01 [PATCH liburing v2 0/4] add basic zero copy receive support David Wei
2025-02-18 22:01 ` [PATCH liburing v2 1/4] liburing: sync io_uring headers David Wei
2025-02-18 22:01 ` [PATCH liburing v2 2/4] zcrx: " David Wei
2025-02-18 22:01 ` [PATCH liburing v2 3/4] zcrx: add basic support David Wei
2025-02-18 22:01 ` [PATCH liburing v2 4/4] zcrx: add unit test David Wei
2025-02-18 23:31 ` [PATCH liburing v2 0/4] add basic zero copy receive support Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox