From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>,
Willem de Bruijn <[email protected]>,
Dragos Tatulea <[email protected]>
Subject: [PATCH 20/20] io_uring/zcrx: add multi socket support per Rx queue
Date: Tue, 7 Nov 2023 13:40:45 -0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
Extract the io_uring internal sock_idx from a sock and set it in each
rbuf cqe. This allows userspace to distinguish which cqe belongs to
which socket (and by association, which flow).
This complicates the uapi, as userspace now needs to keep a table mapping
sock_idx to bufs per loop iteration. Each io_recvzc request on a socket
will return its own completion event, but all rbuf cqes from all sockets
already exist in the rbuf cq ring.
Co-developed-by: Pavel Begunkov <[email protected]>
Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
include/uapi/linux/io_uring.h | 3 ++-
io_uring/net.c | 1 +
io_uring/zc_rx.c | 29 ++++++++++++++++++++++-------
3 files changed, 25 insertions(+), 8 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 603d07d0a791..588fd7eda797 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -754,8 +754,9 @@ struct io_uring_rbuf_cqe {
__u32 off;
__u32 len;
__u16 region;
+ __u8 sock;
__u8 flags;
- __u8 __pad[3];
+ __u8 __pad[2];
};
struct io_rbuf_rqring_offsets {
diff --git a/io_uring/net.c b/io_uring/net.c
index e7b41c5826d5..4f8d19e88dcb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1031,6 +1031,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct io_zc_rx_ifq *ifq;
+ unsigned sock_idx;
if (issue_flags & IO_URING_F_UNLOCKED)
return -EAGAIN;
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 611a068c3402..fdeaed4b4883 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -47,6 +47,11 @@ struct io_zc_refill_data {
unsigned count;
};
+struct io_zc_rx_recv_args {
+ struct io_zc_rx_ifq *ifq;
+ struct socket *sock;
+};
+
static inline u32 io_zc_rx_cqring_entries(struct io_zc_rx_ifq *ifq)
{
struct io_rbuf_ring *ring = ifq->ring;
@@ -667,7 +672,7 @@ static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *
}
static ssize_t zc_rx_copy_chunk(struct io_zc_rx_ifq *ifq, void *data,
- unsigned int offset, size_t len)
+ unsigned int offset, size_t len, unsigned sock_idx)
{
size_t copy_size, copied = 0;
struct io_uring_rbuf_cqe *cqe;
@@ -702,6 +707,7 @@ static ssize_t zc_rx_copy_chunk(struct io_zc_rx_ifq *ifq, void *data,
cqe->off = pgid * PAGE_SIZE + off;
cqe->len = copy_size;
cqe->flags = 0;
+ cqe->sock = sock_idx;
offset += copy_size;
len -= copy_size;
@@ -712,7 +718,7 @@ static ssize_t zc_rx_copy_chunk(struct io_zc_rx_ifq *ifq, void *data,
}
static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
- int off, int len, bool zc_skb)
+ int off, int len, unsigned sock_idx, bool zc_skb)
{
struct io_uring_rbuf_cqe *cqe;
struct page *page;
@@ -732,6 +738,7 @@ static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
cqe->region = 0;
cqe->off = pgid * PAGE_SIZE + off;
cqe->len = len;
+ cqe->sock = sock_idx;
cqe->flags = 0;
} else {
u32 p_off, p_len, t, copied = 0;
@@ -741,7 +748,7 @@ static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
skb_frag_foreach_page(frag, off, len,
page, p_off, p_len, t) {
vaddr = kmap_local_page(page);
- ret = zc_rx_copy_chunk(ifq, vaddr, p_off, p_len);
+ ret = zc_rx_copy_chunk(ifq, vaddr, p_off, p_len, sock_idx);
kunmap_local(vaddr);
if (ret < 0)
@@ -758,9 +765,12 @@ static int
zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t len)
{
- struct io_zc_rx_ifq *ifq = desc->arg.data;
+ struct io_zc_rx_recv_args *args = desc->arg.data;
+ struct io_zc_rx_ifq *ifq = args->ifq;
+ struct socket *sock = args->sock;
struct io_zc_rx_ifq *skb_ifq;
struct sk_buff *frag_iter;
+ unsigned sock_idx = sock->zc_rx_idx & IO_ZC_IFQ_IDX_MASK;
unsigned start, start_off = offset;
int i, copy, end, off;
bool zc_skb = true;
@@ -778,7 +788,7 @@ zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
size_t to_copy;
to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
- copied = zc_rx_copy_chunk(ifq, skb->data, offset, to_copy);
+ copied = zc_rx_copy_chunk(ifq, skb->data, offset, to_copy, sock_idx);
if (copied < 0) {
ret = copied;
goto out;
@@ -807,7 +817,7 @@ zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
copy = len;
off = offset - start;
- ret = zc_rx_recv_frag(ifq, frag, off, copy, zc_skb);
+ ret = zc_rx_recv_frag(ifq, frag, off, copy, sock_idx, zc_skb);
if (ret < 0)
goto out;
@@ -850,9 +860,14 @@ zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
static int io_zc_rx_tcp_read(struct io_zc_rx_ifq *ifq, struct sock *sk)
{
+ struct io_zc_rx_recv_args args = {
+ .ifq = ifq,
+ .sock = sk->sk_socket,
+ };
+
read_descriptor_t rd_desc = {
.count = 1,
- .arg.data = ifq,
+ .arg.data = &args,
};
return tcp_read_sock(sk, &rd_desc, zc_rx_recv_skb);
--
2.39.3
prev parent reply other threads:[~2023-11-07 21:41 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-11-07 21:40 [RFC PATCH v2 00/20] Zero copy Rx using io_uring David Wei
2023-11-07 21:40 ` [PATCH 01/20] io_uring: add interface queue David Wei
2023-11-07 21:40 ` [PATCH 02/20] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-11-07 21:40 ` [PATCH 03/20] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-11-07 21:40 ` [PATCH 04/20] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2023-11-07 21:40 ` [PATCH 05/20] io_uring/zcrx: implement socket registration David Wei
2023-11-07 21:40 ` [PATCH 06/20] io_uring: add ZC buf and pool David Wei
2023-11-07 21:40 ` [PATCH 07/20] io_uring: add ZC pool API David Wei
2023-11-07 21:40 ` [PATCH 08/20] skbuff: add SKBFL_FIXED_FRAG and skb_fixed() David Wei
2023-11-07 21:40 ` [PATCH 09/20] io_uring: allocate a uarg for freeing zero copy skbs David Wei
2023-11-07 21:40 ` [PATCH 10/20] io_uring: delay ZC pool destruction David Wei
2023-11-07 21:40 ` [PATCH 11/20] net: add data pool David Wei
2023-11-07 21:40 ` [PATCH 12/20] io_uring: add io_recvzc request David Wei
2023-11-07 21:40 ` [PATCH 13/20] io_uring/zcrx: propagate ifq down the stack David Wei
2023-11-07 21:40 ` [PATCH 14/20] io_uring/zcrx: introduce io_zc_get_rbuf_cqe David Wei
2023-11-07 21:40 ` [PATCH 15/20] io_uring/zcrx: add copy fallback David Wei
2023-11-07 21:40 ` [PATCH 16/20] net: execute custom callback from napi David Wei
2023-11-07 21:40 ` [PATCH 17/20] io_uring/zcrx: copy fallback to ring buffers David Wei
2023-11-07 21:40 ` [PATCH 18/20] veth: add support for io_uring zc rx David Wei
2023-11-07 21:40 ` [PATCH 19/20] bnxt: use data pool David Wei
2023-11-07 21:40 ` David Wei [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox