From: Jonathan Lemon <[email protected]>
To: <[email protected]>
Cc: <[email protected]>
Subject: [RFC PATCH v3 12/15] io_uring: add OP_RECV_ZC command.
Date: Wed, 2 Nov 2022 16:32:41 -0700
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
This is still a work in progress. The current code temporarily uses
addr3 as a hack in order to leverage the existing logic in
io_recvmsg_prep().
The recvzc opcode uses a metadata buffer, either supplied directly
via buf/len or selected indirectly from the buffer group. The kernel
fills this buffer with an array of io_uring_zctap_iov structures,
each of which points at the data in user memory.
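
As an illustration, here is a hypothetical consumer-side sketch of
how an application might walk the returned array. It assumes that
cqe->res carries the number of metadata bytes filled (io_zctap_recv()
below returns zrd->iov_space), and resolve_buf() stands in for
however the application maps a (bgid, bid) pair back to a virtual
address in its registered region; neither is part of this patch:

	/* hypothetical sketch, not part of this patch */
	struct io_uring_zctap_iov *zov = meta_buf;
	unsigned int i, n = cqe->res / sizeof(*zov);

	for (i = 0; i < n; i++) {
		/* map (bgid, bid) back to the buffer's user address */
		char *data = resolve_buf(zov[i].bgid, zov[i].bid);

		consume(data + zov[i].off, zov[i].len);
	}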
addr3 = (readlen << 32) | (copy_bgid << 16) | ctx->ifq_id;
The amount of data returned is limited both by the number of iovs
that the metadata area can hold and by the readlen parameter.
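
For clarity, a userspace sketch of how an SQE could be prepared with
this encoding. The liburing-style setup is illustrative and
prep_recv_zc() is a hypothetical helper, not an interface added by
this patch:

	static void prep_recv_zc(struct io_uring *ring, int sockfd,
				 void *meta, unsigned int meta_len,
				 __u32 readlen, __u16 copy_bgid,
				 __u16 ifq_id)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		/* metadata buffer supplied directly via buf/len */
		io_uring_prep_recv(sqe, sockfd, meta, meta_len, 0);
		sqe->opcode = IORING_OP_RECV_ZC;

		/* addr3 = (readlen << 32) | (copy_bgid << 16) | ifq_id */
		sqe->addr3 = ((__u64)readlen << 32) |
			     ((__u64)copy_bgid << 16) | ifq_id;
	}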
As a fallback (and for testing purposes), if the skb data is not
present in user memory (perhaps due to system misconfiguration), a
separate buffer is obtained from the copy_bgid group and the data is
copied into user memory.
Signed-off-by: Jonathan Lemon <[email protected]>
---
 include/uapi/linux/io_uring.h |   1 +
 io_uring/net.c                | 121 ++++++++++++
 io_uring/opdef.c              |  15 ++
 io_uring/zctap.c              | 340 ++++++++++++++++++++++++++++++++++
 io_uring/zctap.h              |  20 ++
 5 files changed, 497 insertions(+)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4d211d224c19..3d553c6662d1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -215,6 +215,7 @@ enum io_uring_op {
IORING_OP_URING_CMD,
IORING_OP_SEND_ZC,
IORING_OP_SENDMSG_ZC,
+ IORING_OP_RECV_ZC,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/net.c b/io_uring/net.c
index 15dea91625e2..3a40e87afe54 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -16,6 +16,7 @@
#include "net.h"
#include "notif.h"
#include "rsrc.h"
+#include "zctap.h"
#if defined(CONFIG_NET)
struct io_shutdown {
@@ -67,6 +68,12 @@ struct io_sr_msg {
struct io_kiocb *notif;
};
+struct io_recvzc {
+ struct io_sr_msg sr;
+ u32 datalen;
+ u16 copy_bgid;
+};
+
#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -908,6 +915,120 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+ u64 recvzc_cmd;
+
+ /* XXX hack so we can temporarily use io_recvmsg_prep */
+ recvzc_cmd = READ_ONCE(sqe->addr3);
+
+ zc->copy_bgid = (recvzc_cmd >> 16) & 0xffff;
+ zc->datalen = recvzc_cmd >> 32;
+
+ return io_recvmsg_prep(req, sqe);
+}
+
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+ struct zctap_read_desc zrd;
+ struct msghdr msg;
+ struct socket *sock;
+ struct iovec iov;
+ unsigned int cflags;
+ unsigned flags;
+ int ret, min_ret = 0;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ size_t len = zc->sr.len;
+
+ if (!(req->flags & REQ_F_POLLED) &&
+ (zc->sr.flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+retry_multishot:
+ if (io_do_buffer_select(req)) {
+ void __user *buf;
+
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ zc->sr.buf = buf;
+ }
+
+ ret = import_single_range(READ, zc->sr.buf, len, &iov, &msg.msg_iter);
+ if (unlikely(ret))
+ goto out_free;
+
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_control = NULL;
+ msg.msg_get_inq = 1;
+ msg.msg_flags = 0;
+ msg.msg_controllen = 0;
+ msg.msg_iocb = NULL;
+ msg.msg_ubuf = NULL;
+
+ flags = zc->sr.msg_flags;
+ if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+ if (flags & MSG_WAITALL)
+ min_ret = iov_iter_count(&msg.msg_iter);
+
+ zrd = (struct zctap_read_desc) {
+ .iov_limit = msg_data_left(&msg),
+ .recv_limit = zc->datalen,
+ .iter = &msg.msg_iter,
+ .ctx = req->ctx,
+ .copy_bgid = zc->copy_bgid,
+ };
+
+ ret = io_zctap_recv(sock, &zrd, &msg, flags);
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && force_nonblock) {
+ if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) {
+ io_kbuf_recycle(req, issue_flags);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+ return -EAGAIN;
+ }
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ zc->sr.len -= ret;
+ zc->sr.buf += ret;
+ zc->sr.done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
+ req_set_fail(req);
+ } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+out_free:
+ req_set_fail(req);
+ }
+
+ if (ret > 0)
+ ret += zc->sr.done_io;
+ else if (zc->sr.done_io)
+ ret = zc->sr.done_io;
+ else
+ io_kbuf_recycle(req, issue_flags);
+
+ cflags = io_put_kbuf(req, issue_flags);
+ if (msg.msg_inq)
+ cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+
+ if (!io_recv_finish(req, &ret, cflags, ret <= 0))
+ goto retry_multishot;
+
+ return ret;
+}
+
void io_send_zc_cleanup(struct io_kiocb *req)
{
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 83dc0f9ad3b2..14b42811a78e 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,7 @@
#include "poll.h"
#include "cancel.h"
#include "rw.h"
+#include "zctap.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -521,6 +522,20 @@ const struct io_op_def io_op_defs[] = {
.fail = io_sendrecv_fail,
#else
.prep = io_eopnotsupp_prep,
+#endif
+ },
+ [IORING_OP_RECV_ZC] = {
+ .name = "RECV_ZC",
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
+ .ioprio = 1,
+#if defined(CONFIG_NET)
+ .prep = io_recvzc_prep,
+ .issue = io_recvzc,
+#else
+ .prep = io_eopnotsupp_prep,
#endif
},
};
diff --git a/io_uring/zctap.c b/io_uring/zctap.c
index a84204c6eb96..32efa7e9199d 100644
--- a/io_uring/zctap.c
+++ b/io_uring/zctap.c
@@ -7,6 +7,7 @@
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/nospec.h>
+#include <net/tcp.h>
#include <uapi/linux/io_uring.h>
@@ -53,6 +54,11 @@ static u64 zctap_page_info(const struct page *page)
return page_private(page);
}
+static u16 zctap_page_region_id(const struct page *page)
+{
+ return (zctap_page_info(page) >> 16) & 0xffff;
+}
+
static u16 zctap_page_id(const struct page *page)
{
return zctap_page_info(page) & 0xffff;
@@ -72,6 +78,14 @@ static bool zctap_page_ours(struct page *page)
#define IO_ZCTAP_UREF 0x10000
#define IO_ZCTAP_KREF_MASK (IO_ZCTAP_UREF - 1)
+static void io_zctap_get_buf_uref(struct ifq_region *ifr, u16 pgid)
+{
+ if (WARN_ON(pgid >= ifr->nr_pages))
+ return;
+
+ atomic_add(IO_ZCTAP_UREF, &ifr->buf[pgid].refcount);
+}
+
/* return user refs back, indicate whether buffer is reusable */
static bool io_zctap_put_buf_uref(struct io_zctap_buf *buf)
{
@@ -392,6 +406,18 @@ static void io_zctap_ifq_callback(struct sk_buff *skb, struct ubuf_info *uarg,
}
}
+static struct io_zctap_ifq *io_zctap_skb_ifq(struct sk_buff *skb)
+{
+ struct io_zctap_ifq_priv *priv;
+ struct ubuf_info *uarg = skb_zcopy(skb);
+
+ if (uarg && uarg->callback == io_zctap_ifq_callback) {
+ priv = container_of(uarg, struct io_zctap_ifq_priv, uarg);
+ return &priv->ifq;
+ }
+ return NULL;
+}
+
static struct io_zctap_ifq *io_zctap_ifq_alloc(struct io_ring_ctx *ctx)
{
struct io_zctap_ifq_priv *priv;
@@ -485,3 +511,317 @@ void io_unregister_zctap_all(struct io_ring_ctx *ctx)
for (i = 0; i < NR_ZCTAP_IFQS; i++)
io_unregister_zctap_ifq(ctx, i);
}
+
+static int __zctap_get_user_buffer(struct zctap_read_desc *zrd, int len)
+{
+ if (!zrd->buflen) {
+ zrd->req = (struct io_kiocb) {
+ .ctx = zrd->ctx,
+ .buf_index = zrd->copy_bgid,
+ };
+
+ zrd->buf = (u8 *)io_zctap_buffer(&zrd->req, &zrd->buflen);
+ zrd->offset = 0;
+ }
+ return len > zrd->buflen ? zrd->buflen : len;
+}
+
+static int zctap_copy_data(struct zctap_read_desc *zrd, int len, u8 *kaddr)
+{
+ struct io_uring_zctap_iov zov;
+ u32 space;
+ int err;
+
+ space = zrd->iov_space + sizeof(zov);
+ if (space > zrd->iov_limit)
+ return 0;
+
+ len = __zctap_get_user_buffer(zrd, len);
+ if (!len)
+ return -ENOBUFS;
+
+ err = copy_to_user(zrd->buf + zrd->offset, kaddr, len);
+ if (err)
+ return -EFAULT;
+
+ zov = (struct io_uring_zctap_iov) {
+ .off = zrd->offset,
+ .len = len,
+ .bgid = zrd->copy_bgid,
+ .bid = zrd->req.buf_index,
+ };
+
+ if (copy_to_iter(&zov, sizeof(zov), zrd->iter) != sizeof(zov))
+ return -EFAULT;
+
+ zrd->offset += len;
+ zrd->buflen -= len;
+ zrd->iov_space = space;
+
+ return len;
+}
+
+static int zctap_copy_frag(struct zctap_read_desc *zrd, struct page *page,
+ int off, int len, struct io_uring_zctap_iov *zov)
+{
+ u8 *kaddr;
+ int err;
+
+ len = __zctap_get_user_buffer(zrd, len);
+ if (!len)
+ return -ENOBUFS;
+
+ kaddr = kmap(page) + off;
+ err = copy_to_user(zrd->buf + zrd->offset, kaddr, len);
+ kunmap(page);
+
+ if (err)
+ return -EFAULT;
+
+ *zov = (struct io_uring_zctap_iov) {
+ .off = zrd->offset,
+ .len = len,
+ .bgid = zrd->copy_bgid,
+ .bid = zrd->req.buf_index,
+ };
+
+ zrd->offset += len;
+ zrd->buflen -= len;
+
+ return len;
+}
+
+static int zctap_recv_frag(struct zctap_read_desc *zrd,
+ struct io_zctap_ifq *ifq,
+ const skb_frag_t *frag, int off, int len)
+{
+ struct io_uring_zctap_iov zov;
+ struct page *page;
+ u32 space;
+ int pgid;
+
+ space = zrd->iov_space + sizeof(zov);
+ if (space > zrd->iov_limit)
+ return 0;
+
+ page = skb_frag_page(frag);
+ off += skb_frag_off(frag);
+
+ if (likely(ifq && ifq->ctx == zrd->ctx && zctap_page_ours(page))) {
+ pgid = zctap_page_id(page);
+ io_zctap_get_buf_uref(ifq->region, pgid);
+ zov = (struct io_uring_zctap_iov) {
+ .off = off,
+ .len = len,
+ .bgid = zctap_page_region_id(page),
+ .bid = pgid,
+ };
+ } else {
+ len = zctap_copy_frag(zrd, page, off, len, &zov);
+ if (len <= 0)
+ return len;
+ }
+
+ if (copy_to_iter(&zov, sizeof(zov), zrd->iter) != sizeof(zov))
+ return -EFAULT;
+
+ zrd->iov_space = space;
+
+ return len;
+}
+
+/* Our version of __skb_datagram_iter -- should work for UDP also. */
+static int
+zctap_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct zctap_read_desc *zrd = desc->arg.data;
+ struct io_zctap_ifq *ifq;
+ unsigned start, start_off;
+ struct sk_buff *frag_iter;
+ int i, copy, end, off;
+ int ret = 0;
+
+ if (zrd->iov_space >= zrd->iov_limit) {
+ desc->count = 0;
+ return 0;
+ }
+ if (len > zrd->recv_limit)
+ len = zrd->recv_limit;
+
+ start = skb_headlen(skb);
+ start_off = offset;
+
+ ifq = io_zctap_skb_ifq(skb);
+
+ if (offset < start) {
+ copy = start - offset;
+ if (copy > len)
+ copy = len;
+
+ /* copy out linear data */
+ ret = zctap_copy_data(zrd, copy, skb->data + offset);
+ if (ret < 0)
+ goto out;
+ offset += ret;
+ len -= ret;
+ if (len == 0 || ret != copy)
+ goto out;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ const skb_frag_t *frag;
+
+ WARN_ON(start > offset + len);
+
+ frag = &skb_shinfo(skb)->frags[i];
+ end = start + skb_frag_size(frag);
+
+ if (offset < end) {
+ copy = end - offset;
+ if (copy > len)
+ copy = len;
+
+ off = offset - start;
+ ret = zctap_recv_frag(zrd, ifq, frag, off, copy);
+ if (ret < 0)
+ goto out;
+
+ offset += ret;
+ len -= ret;
+ if (len == 0 || ret != copy)
+ goto out;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if (offset < end) {
+ copy = end - offset;
+ if (copy > len)
+ copy = len;
+
+ off = offset - start;
+ ret = zctap_recv_skb(desc, frag_iter, off, copy);
+ if (ret < 0)
+ goto out;
+
+ offset += ret;
+ len -= ret;
+ if (len == 0 || ret != copy)
+ goto out;
+ }
+ start = end;
+ }
+
+out:
+ if (offset == start_off)
+ return ret;
+ return offset - start_off;
+}
+
+static int __io_zctap_tcp_read(struct sock *sk, struct zctap_read_desc *zrd)
+{
+ read_descriptor_t rd_desc = {
+ .arg.data = zrd,
+ .count = 1,
+ };
+
+ return tcp_read_sock(sk, &rd_desc, zctap_recv_skb);
+}
+
+static int io_zctap_tcp_recvmsg(struct sock *sk, struct zctap_read_desc *zrd,
+ int flags, int *addr_len)
+{
+ size_t used;
+ long timeo;
+ int ret;
+
+ ret = used = 0;
+
+ lock_sock(sk);
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ while (zrd->recv_limit) {
+ ret = __io_zctap_tcp_read(sk, zrd);
+ if (ret < 0)
+ break;
+ if (!ret) {
+ if (used)
+ break;
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+ sk_wait_data(sk, &timeo, NULL);
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+ continue;
+ }
+ zrd->recv_limit -= ret;
+ used += ret;
+
+ if (!timeo)
+ break;
+ release_sock(sk);
+ lock_sock(sk);
+
+ if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current))
+ break;
+ }
+
+ release_sock(sk);
+
+ /* XXX, handle timestamping */
+
+ if (used)
+ return used;
+
+ return ret;
+}
+
+int io_zctap_recv(struct socket *sock, struct zctap_read_desc *zrd,
+ struct msghdr *msg, unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ const struct proto *prot;
+ int addr_len = 0;
+ int ret;
+
+ if (flags & MSG_ERRQUEUE)
+ return -EOPNOTSUPP;
+
+ prot = READ_ONCE(sk->sk_prot);
+ if (prot->recvmsg != tcp_recvmsg)
+ return -EPROTONOSUPPORT;
+
+ sock_rps_record_flow(sk);
+
+ ret = io_zctap_tcp_recvmsg(sk, zrd, flags, &addr_len);
+ if (ret >= 0) {
+ msg->msg_namelen = addr_len;
+ ret = zrd->iov_space;
+ }
+ return ret;
+}
diff --git a/io_uring/zctap.h b/io_uring/zctap.h
index bb44f8e972e8..4db516707d19 100644
--- a/io_uring/zctap.h
+++ b/io_uring/zctap.h
@@ -2,10 +2,30 @@
#ifndef IOU_ZCTAP_H
#define IOU_ZCTAP_H
+struct zctap_read_desc {
+ struct io_ring_ctx *ctx;
+ struct iov_iter *iter;
+ u32 iov_space;
+ u32 iov_limit;
+ u32 recv_limit;
+
+ struct io_kiocb req;
+ u8 *buf;
+ size_t offset;
+ size_t buflen;
+
+ u16 copy_bgid; /* XXX move to register ifq? */
+};
+
int io_register_ifq(struct io_ring_ctx *ctx,
struct io_uring_ifq_req __user *arg);
void io_unregister_zctap_all(struct io_ring_ctx *ctx);
int io_provide_ifq_region(struct io_zctap_ifq *ifq, u16 id);
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_zctap_recv(struct socket *sock, struct zctap_read_desc *zrd,
+ struct msghdr *msg, unsigned int flags);
+
#endif
--
2.30.2