From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
"David S. Miller" <[email protected]>,
Eric Dumazet <[email protected]>,
Jesper Dangaard Brouer <[email protected]>,
David Ahern <[email protected]>,
Mina Almasry <[email protected]>
Subject: [RFC PATCH v3 15/20] io_uring: add io_recvzc request
Date: Tue, 19 Dec 2023 13:03:52 -0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: David Wei <[email protected]>
This patch adds an io_uring opcode OP_RECV_ZC for doing ZC reads from a
socket that is set up for ZC Rx. The request reads skbs from the socket,
whose page frags are tagged with a magic cookie in their page private
field. For each frag, entries are written into the ifq rbuf completion
ring, and the total number of bytes read is returned to user as an
io_uring completion event.
Multishot requests work. There is no need to specify provided buffers as
data is returned in the ifq rbuf completion rings.
Userspace is expected to look into the ifq rbuf completion ring when it
receives an io_uring completion event.
The addr3 field is used to encode params in the following format:
addr3 = (readlen << 32) | ifq_id;
readlen is the max amount of data to read from the socket. ifq_id is the
interface queue id, and currently only 0 is supported.
Signed-off-by: David Wei <[email protected]>
---
include/uapi/linux/io_uring.h | 1 +
io_uring/net.c | 119 ++++++++++++++++-
io_uring/opdef.c | 16 +++
io_uring/zc_rx.c | 240 +++++++++++++++++++++++++++++++++-
io_uring/zc_rx.h | 5 +
5 files changed, 375 insertions(+), 6 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f4ba58bce3bd..f57f394744fe 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -253,6 +253,7 @@ enum io_uring_op {
IORING_OP_FUTEX_WAIT,
IORING_OP_FUTEX_WAKE,
IORING_OP_FUTEX_WAITV,
+ IORING_OP_RECV_ZC,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/net.c b/io_uring/net.c
index 454ba301ae6b..7a2aadf6962c 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -71,6 +71,16 @@ struct io_sr_msg {
struct io_kiocb *notif;
};
+struct io_recvzc {
+ struct file *file;
+ unsigned len; /* sqe->len; reduced by each partial receive in io_recvzc() */
+ unsigned done_io; /* bytes received so far across partial-IO retries */
+ unsigned msg_flags; /* sqe->msg_flags (MSG_*), plus MSG_CMSG_COMPAT on compat rings */
+ u16 flags; /* sqe->ioprio; only RECVMSG_FLAGS bits are accepted */
+
+ u32 datalen; /* upper 32 bits of sqe->addr3; passed to io_zc_rx_recv() as the read limit */
+};
+
static inline bool io_check_multishot(struct io_kiocb *req,
unsigned int issue_flags)
{
@@ -637,7 +647,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
unsigned int cflags;
cflags = io_put_kbuf(req, issue_flags);
- if (msg->msg_inq && msg->msg_inq != -1)
+ if (msg && msg->msg_inq && msg->msg_inq != -1)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
@@ -652,7 +662,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
io_recv_prep_retry(req);
/* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
- msg->msg_inq == -1)
+ (msg && msg->msg_inq == -1))
return false;
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_ISSUE_SKIP_COMPLETE;
@@ -956,9 +966,8 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
-static __maybe_unused
-struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
- struct socket *sock)
+static struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
+ struct socket *sock)
{
unsigned token = READ_ONCE(sock->zc_rx_idx);
unsigned ifq_idx = token >> IO_ZC_IFQ_IDX_OFFSET;
@@ -975,6 +984,106 @@ struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
return ifq;
}
+/*
+ * Prepare an IORING_OP_RECV_ZC request from its SQE.
+ *
+ * addr3 carries the parameters: the upper 32 bits are the maximum number
+ * of bytes to read (stored in zc->datalen) and the low 16 bits must be
+ * zero. The ring must have been created with IORING_SETUP_DEFER_TASKRUN,
+ * and file_index/addr2 must be unused. ioprio carries the RECVMSG_FLAGS
+ * bits and msg_flags the MSG_* flags for the receive.
+ *
+ * Returns 0 on success, or -EINVAL if any of the checks above fails.
+ */
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	u64 recvzc_cmd;
+
+	recvzc_cmd = READ_ONCE(sqe->addr3);
+	zc->datalen = recvzc_cmd >> 32;
+	/* the low 16 bits of addr3 must be zero */
+	if (recvzc_cmd & 0xffff)
+		return -EINVAL;
+	if (!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	/* read SQE fields with READ_ONCE, like every other field here */
+	if (unlikely(READ_ONCE(sqe->file_index) || READ_ONCE(sqe->addr2)))
+		return -EINVAL;
+
+	zc->len = READ_ONCE(sqe->len);
+	zc->flags = READ_ONCE(sqe->ioprio);
+	if (zc->flags & ~(RECVMSG_FLAGS))
+		return -EINVAL;
+	zc->msg_flags = READ_ONCE(sqe->msg_flags);
+	if (zc->msg_flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+	if (zc->msg_flags & MSG_ERRQUEUE)
+		req->flags |= REQ_F_CLEAR_POLLIN;
+	if (zc->flags & IORING_RECV_MULTISHOT) {
+		/* multishot combined with MSG_WAITALL is rejected */
+		if (zc->msg_flags & MSG_WAITALL)
+			return -EINVAL;
+		/*
+		 * This handler is only registered for IORING_OP_RECV_ZC, so
+		 * the former "opcode == IORING_OP_RECV && len" test could
+		 * never be true and has been dropped.
+		 */
+		req->flags |= REQ_F_APOLL_MULTISHOT;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		zc->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+	zc->done_io = 0;
+	return 0;
+}
+
+/*
+ * Issue an IORING_OP_RECV_ZC request.
+ *
+ * Looks up the interface queue the socket is registered with and calls
+ * io_zc_rx_recv() with zc->datalen as the read limit. The byte count
+ * (including bytes from earlier partial attempts, zc->done_io) is handed
+ * to io_recv_finish(); for multishot requests the receive is retried
+ * while io_recv_finish() returns false.
+ *
+ * Returns:
+ *   -EAGAIN                  when issued unlocked, when POLL_FIRST is set
+ *                            and the request has not been polled yet, when
+ *                            a nonblocking attempt found no data, or after
+ *                            a partial receive that should be retried
+ *   IOU_ISSUE_SKIP_COMPLETE  for a multishot issue that found no data
+ *   -ENOTSOCK                if the file is not a socket
+ *   -EINVAL                  if io_zc_verify_sock() finds no ifq
+ *   otherwise the value left in ret by io_recv_finish()
+ */
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	struct socket *sock;
+	unsigned flags;
+	int ret, min_ret = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	struct io_zc_rx_ifq *ifq;
+
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		return -EAGAIN;
+
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
+		return -EAGAIN;
+
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+	ifq = io_zc_verify_sock(req, sock);
+	if (!ifq)
+		return -EINVAL;
+
+retry_multishot:
+	flags = zc->msg_flags;
+	if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+	/* with MSG_WAITALL anything shorter than zc->len counts as short */
+	if (flags & MSG_WAITALL)
+		min_ret = zc->len;
+
+	ret = io_zc_rx_recv(ifq, sock, zc->datalen, flags);
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && force_nonblock) {
+			if (issue_flags & IO_URING_F_MULTISHOT)
+				return IOU_ISSUE_SKIP_COMPLETE;
+			return -EAGAIN;
+		}
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			/* remember the partial progress and retry later */
+			zc->len -= ret;
+			zc->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		req_set_fail(req);
+	} else if ((flags & MSG_WAITALL) && (flags & (MSG_TRUNC | MSG_CTRUNC))) {
+		req_set_fail(req);
+	}
+
+	/* fold in bytes from earlier partial attempts */
+	if (ret > 0)
+		ret += zc->done_io;
+	else if (zc->done_io)
+		ret = zc->done_io;
+
+	/*
+	 * There is no msghdr for this opcode; io_recv_finish() tolerates a
+	 * NULL msg. Pass NULL rather than the integer 0 for the pointer.
+	 */
+	if (!io_recv_finish(req, &ret, NULL, ret <= 0, issue_flags))
+		goto retry_multishot;
+
+	return ret;
+}
+
void io_send_zc_cleanup(struct io_kiocb *req)
{
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 799db44283c7..a90231566d09 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -35,6 +35,7 @@
#include "rw.h"
#include "waitid.h"
#include "futex.h"
+#include "zc_rx.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -467,6 +468,18 @@ const struct io_issue_def io_issue_defs[] = {
.issue = io_futexv_wait,
#else
.prep = io_eopnotsupp_prep,
+#endif
+ },
+ [IORING_OP_RECV_ZC] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollin = 1,
+ .ioprio = 1,
+#if defined(CONFIG_NET)
+ .prep = io_recvzc_prep,
+ .issue = io_recvzc,
+#else
+ .prep = io_eopnotsupp_prep,
#endif
},
};
@@ -704,6 +717,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FUTEX_WAITV] = {
.name = "FUTEX_WAITV",
},
+ [IORING_OP_RECV_ZC] = {
+ .name = "RECV_ZC",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index ff1dac24ac40..acb70ca23150 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -6,6 +6,7 @@
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/nospec.h>
+#include <net/tcp.h>
#include <trace/events/page_pool.h>
#include <uapi/linux/io_uring.h>
@@ -15,8 +16,20 @@
#include "zc_rx.h"
#include "rsrc.h"
+struct io_zc_rx_args {
+ struct io_zc_rx_ifq *ifq; /* interface queue whose rbuf completion ring is filled */
+ struct socket *sock; /* socket being read; its zc_rx_idx supplies the cqe sock index */
+};
+
typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
+/*
+ * Distance between the cached completion tail and the completion head
+ * currently stored in the shared rbuf ring.
+ */
+static inline u32 io_zc_rx_cqring_entries(struct io_zc_rx_ifq *ifq)
+{
+	return ifq->cached_cq_tail - READ_ONCE(ifq->ring->cq.head);
+}
+
static inline struct device *netdev2dev(struct net_device *dev)
{
return dev->dev.parent;
@@ -399,7 +412,7 @@ static inline unsigned io_buf_pgid(struct io_zc_rx_pool *pool,
return buf - pool->bufs;
}
-static __maybe_unused void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
+static void io_zc_rx_get_buf_uref(struct io_zc_rx_buf *buf)
{
refcount_add(IO_ZC_RX_UREF, &buf->ppiov.refcount);
}
@@ -590,5 +603,230 @@ const struct pp_memory_provider_ops io_uring_pp_zc_ops = {
};
EXPORT_SYMBOL(io_uring_pp_zc_ops);
+/*
+ * Claim the next slot of the ifq rbuf completion ring.
+ *
+ * Returns a pointer to the slot and advances ifq->cached_cq_tail, or NULL
+ * when no slot is available. The shared ring tail is not updated here.
+ */
+static inline struct io_uring_rbuf_cqe *io_zc_get_rbuf_cqe(struct io_zc_rx_ifq *ifq)
+{
+	unsigned int slot, in_use, avail;
+
+	slot = ifq->cached_cq_tail & (ifq->cq_entries - 1);
+	smp_rmb();
+	in_use = min(io_zc_rx_cqring_entries(ifq), ifq->cq_entries);
+	avail = min(ifq->cq_entries - in_use, ifq->cq_entries - slot);
+	if (!avail)
+		return NULL;
+
+	ifq->cached_cq_tail++;
+	return &ifq->cqes[slot];
+}
+
+/*
+ * Describe @len bytes starting @off bytes into @frag with one rbuf
+ * completion entry and take a user reference on the backing buffer.
+ *
+ * Returns @len on success, -EOPNOTSUPP if the frag is not a page pool iov,
+ * -EFAULT if it does not belong to @ifq's io_uring ZC Rx provider, or
+ * -ENOBUFS if no completion entry is available.
+ */
+static int zc_rx_recv_frag(struct io_zc_rx_ifq *ifq, const skb_frag_t *frag,
+			   int off, int len, unsigned sock_idx)
+{
+	struct io_uring_rbuf_cqe *cqe;
+	struct page_pool_iov *ppiov;
+	struct io_zc_rx_buf *buf;
+
+	if (unlikely(!page_is_page_pool_iov(frag->bv_page)))
+		return -EOPNOTSUPP;
+
+	ppiov = page_to_page_pool_iov(frag->bv_page);
+	if (ppiov->pp->p.memory_provider != PP_MP_IOU_ZCRX ||
+	    ppiov->pp->mp_priv != ifq)
+		return -EFAULT;
+
+	cqe = io_zc_get_rbuf_cqe(ifq);
+	if (!cqe)
+		return -ENOBUFS;
+
+	buf = io_iov_to_buf(ppiov);
+	io_zc_rx_get_buf_uref(buf);
+
+	/* make the offset relative to the start of the buffer's page */
+	off += skb_frag_off(frag);
+
+	cqe->region = 0;
+	cqe->off = io_buf_pgid(ifq->pool, buf) * PAGE_SIZE + off;
+	cqe->len = len;
+	cqe->sock = sock_idx;
+	cqe->flags = 0;
+
+	return len;
+}
+
+static int
+zc_rx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct io_zc_rx_args *args = desc->arg.data;
+ struct io_zc_rx_ifq *ifq = args->ifq;
+ struct socket *sock = args->sock;
+ unsigned sock_idx = sock->zc_rx_idx & IO_ZC_IFQ_IDX_MASK;
+ struct sk_buff *frag_iter;
+ unsigned start, start_off;
+ int i, copy, end, off;
+ int ret = 0;
+ /*
+  * Actor passed to tcp_read_sock(): walk @skb from @offset for up to
+  * @len bytes, handing each page frag to zc_rx_recv_frag() and
+  * recursing into skbs on the frag list. start/end hold the byte range
+  * of the piece currently examined. Returns the number of bytes
+  * consumed, or, if nothing was consumed, the last value of ret
+  * (0 or a negative error).
+  */
+ start = skb_headlen(skb);
+ start_off = offset;
+ /* Data that begins inside the linear area (before skb_headlen) is refused. */
+ if (offset < start)
+ return -EOPNOTSUPP;
+ /* First pass: the skb's own page frags. */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ const skb_frag_t *frag;
+
+ WARN_ON(start > offset + len);
+
+ frag = &skb_shinfo(skb)->frags[i];
+ end = start + skb_frag_size(frag);
+
+ if (offset < end) {
+ copy = end - offset;
+ if (copy > len)
+ copy = len;
+
+ off = offset - start;
+ ret = zc_rx_recv_frag(ifq, frag, off, copy, sock_idx);
+ if (ret < 0)
+ goto out;
+
+ offset += ret;
+ len -= ret;
+ if (len == 0 || ret != copy) /* all requested bytes done, or a short result: stop */
+ goto out;
+ }
+ start = end;
+ }
+ /* Second pass: skbs chained on the frag list, handled by recursion. */
+ skb_walk_frags(skb, frag_iter) {
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if (offset < end) {
+ copy = end - offset;
+ if (copy > len)
+ copy = len;
+
+ off = offset - start;
+ ret = zc_rx_recv_skb(desc, frag_iter, off, copy);
+ if (ret < 0)
+ goto out;
+
+ offset += ret;
+ len -= ret;
+ if (len == 0 || ret != copy) /* all requested bytes done, or a short result: stop */
+ goto out;
+ }
+ start = end;
+ }
+ /* Store the cached tail into the shared ring's cq.tail with release ordering. */
+out:
+ smp_store_release(&ifq->ring->cq.tail, ifq->cached_cq_tail);
+ if (offset == start_off)
+ return ret; /* nothing consumed: 0 or the error from the last step */
+ return offset - start_off; /* bytes consumed from this skb */
+}
+
+/*
+ * Run tcp_read_sock() on @sk with zc_rx_recv_skb() as the actor, passing
+ * @ifq and the socket to it through the read descriptor.
+ * Returns whatever tcp_read_sock() returns.
+ */
+static int io_zc_rx_tcp_read(struct io_zc_rx_ifq *ifq, struct sock *sk)
+{
+	struct io_zc_rx_args actor_args = { .ifq = ifq, .sock = sk->sk_socket };
+	read_descriptor_t desc = { .count = 1, .arg.data = &actor_args };
+
+	return tcp_read_sock(sk, &desc, zc_rx_recv_skb);
+}
+
+/*
+ * Receive loop for zero-copy Rx on a TCP socket.
+ *
+ * @ifq:        interface queue whose completion ring receives the entries
+ * @sk:         socket to read from; locked for the duration of the loop
+ * @recv_limit: number of bytes after which the loop stops; 0 reads nothing
+ * @flags:      MSG_* flags; MSG_DONTWAIT selects a zero receive timeout
+ * @addr_len:   unused
+ *
+ * Returns the number of bytes consumed if any, otherwise the last status:
+ * 0, or a negative error such as -EAGAIN, -ENOTCONN, the pending socket
+ * error, or the signal-interruption errno.
+ */
+static int io_zc_rx_tcp_recvmsg(struct io_zc_rx_ifq *ifq, struct sock *sk,
+				unsigned int recv_limit,
+				int flags, int *addr_len)
+{
+	size_t used = 0;
+	long timeo;
+	int ret = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	while (recv_limit) {
+		ret = io_zc_rx_tcp_read(ifq, sk);
+		if (ret < 0)
+			break;
+		if (!ret) {
+			/* nothing read this round: decide whether to stop or wait */
+			if (used)
+				break;
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				ret = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				ret = -EAGAIN;
+				break;
+			}
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				break;
+			sk_wait_data(sk, &timeo, NULL);
+			if (signal_pending(current)) {
+				ret = sock_intr_errno(timeo);
+				break;
+			}
+			continue;
+		}
+		/*
+		 * io_zc_rx_tcp_read() is not bounded by recv_limit, so a
+		 * single pass may return more than what is left. Clamp the
+		 * subtraction: a plain "recv_limit -= ret" would wrap the
+		 * unsigned limit and keep the loop running.
+		 */
+		recv_limit -= min_t(unsigned int, ret, recv_limit);
+		used += ret;
+
+		if (!timeo)
+			break;
+		/* drop and retake the socket lock between rounds */
+		release_sock(sk);
+		lock_sock(sk);
+
+		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current))
+			break;
+	}
+	release_sock(sk);
+	/* TODO: handle timestamping */
+	return used ? used : ret;
+}
+
+/*
+ * Entry point for a zero-copy receive on @sock through @ifq.
+ *
+ * Rejects MSG_ERRQUEUE with -EOPNOTSUPP and any socket whose protocol
+ * recvmsg handler is not tcp_recvmsg with -EPROTONOSUPPORT; otherwise
+ * returns the result of io_zc_rx_tcp_recvmsg() with @limit as the limit.
+ */
+int io_zc_rx_recv(struct io_zc_rx_ifq *ifq, struct socket *sock,
+		  unsigned int limit, unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	const struct proto *prot;
+	int addr_len = 0;
+
+	if (flags & MSG_ERRQUEUE)
+		return -EOPNOTSUPP;
+
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot->recvmsg != tcp_recvmsg)
+		return -EPROTONOSUPPORT;
+
+	sock_rps_record_flow(sk);
+
+	return io_zc_rx_tcp_recvmsg(ifq, sk, limit, flags, &addr_len);
+}
#endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 00d864700c67..3e8f07e4b252 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -72,4 +72,9 @@ static inline int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
}
#endif
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_zc_rx_recv(struct io_zc_rx_ifq *ifq, struct socket *sock,
+ unsigned int limit, unsigned int flags);
+
#endif
--
2.39.3
next prev parent reply other threads:[~2023-12-19 21:04 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-12-19 21:03 [RFC PATCH v3 00/20] Zero copy Rx using io_uring David Wei
2023-12-19 21:03 ` [RFC PATCH v3 01/20] net: page_pool: add ppiov mangling helper David Wei
2023-12-19 23:22 ` Mina Almasry
2023-12-19 23:59 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 02/20] tcp: don't allow non-devmem originated ppiov David Wei
2023-12-19 23:24 ` Mina Almasry
2023-12-20 1:29 ` Pavel Begunkov
2024-01-02 16:11 ` Mina Almasry
2023-12-19 21:03 ` [RFC PATCH v3 03/20] net: page pool: rework ppiov life cycle David Wei
2023-12-19 23:35 ` Mina Almasry
2023-12-20 0:49 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 04/20] net: enable napi_pp_put_page for ppiov David Wei
2023-12-19 21:03 ` [RFC PATCH v3 05/20] net: page_pool: add ->scrub mem provider callback David Wei
2023-12-19 21:03 ` [RFC PATCH v3 06/20] io_uring: separate header for exported net bits David Wei
2023-12-20 16:01 ` Jens Axboe
2023-12-19 21:03 ` [RFC PATCH v3 07/20] io_uring: add interface queue David Wei
2023-12-20 16:13 ` Jens Axboe
2023-12-20 16:23 ` Pavel Begunkov
2023-12-21 1:44 ` David Wei
2023-12-21 17:57 ` Willem de Bruijn
2023-12-30 16:25 ` Pavel Begunkov
2023-12-31 22:25 ` Willem de Bruijn
2023-12-19 21:03 ` [RFC PATCH v3 08/20] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-12-20 16:13 ` Jens Axboe
2023-12-19 21:03 ` [RFC PATCH v3 09/20] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-12-19 21:03 ` [RFC PATCH v3 10/20] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2023-12-20 16:06 ` Jens Axboe
2023-12-20 16:24 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 11/20] io_uring/zcrx: implement socket registration David Wei
2023-12-19 21:03 ` [RFC PATCH v3 12/20] io_uring: add ZC buf and pool David Wei
2023-12-19 21:03 ` [RFC PATCH v3 13/20] io_uring: implement pp memory provider for zc rx David Wei
2023-12-19 23:44 ` Mina Almasry
2023-12-20 0:39 ` Pavel Begunkov
2023-12-21 19:36 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 14/20] net: page pool: add io_uring memory provider David Wei
2023-12-19 23:39 ` Mina Almasry
2023-12-20 0:04 ` Pavel Begunkov
2023-12-19 21:03 ` David Wei [this message]
2023-12-20 16:27 ` [RFC PATCH v3 15/20] io_uring: add io_recvzc request Jens Axboe
2023-12-20 17:04 ` Pavel Begunkov
2023-12-20 18:09 ` Jens Axboe
2023-12-21 18:59 ` Pavel Begunkov
2023-12-21 21:32 ` Jens Axboe
2023-12-30 21:15 ` Pavel Begunkov
2023-12-19 21:03 ` [RFC PATCH v3 16/20] net: execute custom callback from napi David Wei
2023-12-19 21:03 ` [RFC PATCH v3 17/20] io_uring/zcrx: add copy fallback David Wei
2023-12-19 21:03 ` [RFC PATCH v3 18/20] veth: add support for io_uring zc rx David Wei
2023-12-19 21:03 ` [RFC PATCH v3 19/20] net: page pool: generalise ppiov dma address get David Wei
2023-12-21 19:51 ` Mina Almasry
2023-12-19 21:03 ` [RFC PATCH v3 20/20] bnxt: enable io_uring zc page pool David Wei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox