public inbox for [email protected]
 help / color / mirror / Atom feed
From: Jonathan Lemon <[email protected]>
To: <[email protected]>
Cc: <[email protected]>
Subject: [RFC PATCH v3 12/15] io_uring: add OP_RECV_ZC command.
Date: Wed, 2 Nov 2022 16:32:41 -0700	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

This is still a WIP.  The current code (temporarily) uses addr3
as a hack in order to leverage code in io_recvmsg_prep.

The recvzc opcode uses a metadata buffer either supplied directly
with buf/len, or indirectly from the buffer group.  The expectation
is that this buffer is then filled with an array of io_uring_zctap_iov
structures, which point to the data in user-memory.

    addr3 = (readlen << 32) | (copy_bgid << 16) | ctx->ifq_id;

The amount of returned data is limited by the number of iovs that
the metadata area can hold, and also the readlen parameter.

As a fallback (and for testing purposes), if the skb data is not
present in user memory (perhaps due to system misconfiguration), then
a seprate buffer is obtained from the copy_bgid and the data is
copied into user-memory.

Signed-off-by: Jonathan Lemon <[email protected]>
---
 include/uapi/linux/io_uring.h |   1 +
 io_uring/net.c                | 121 ++++++++++++
 io_uring/opdef.c              |  15 ++
 io_uring/zctap.c              | 340 ++++++++++++++++++++++++++++++++++
 io_uring/zctap.h              |  20 ++
 5 files changed, 497 insertions(+)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4d211d224c19..3d553c6662d1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -215,6 +215,7 @@ enum io_uring_op {
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
 	IORING_OP_SENDMSG_ZC,
+	IORING_OP_RECV_ZC,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/net.c b/io_uring/net.c
index 15dea91625e2..3a40e87afe54 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -16,6 +16,7 @@
 #include "net.h"
 #include "notif.h"
 #include "rsrc.h"
+#include "zctap.h"
 
 #if defined(CONFIG_NET)
 struct io_shutdown {
@@ -67,6 +68,12 @@ struct io_sr_msg {
 	struct io_kiocb 		*notif;
 };
 
+struct io_recvzc {
+	struct io_sr_msg		sr;
+	u32				datalen;
+	u16				copy_bgid;
+};
+
 #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
 
 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -908,6 +915,120 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	return ret;
 }
 
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	u64 recvzc_cmd;
+
+	/* XXX hack so we can temporarily use io_recvmsg_prep */
+	recvzc_cmd = READ_ONCE(sqe->addr3);
+
+	zc->copy_bgid = (recvzc_cmd >> 16) & 0xffff;
+	zc->datalen = recvzc_cmd >> 32;
+
+	return io_recvmsg_prep(req, sqe);
+}
+
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
+	struct zctap_read_desc zrd;
+	struct msghdr msg;
+	struct socket *sock;
+	struct iovec iov;
+	unsigned int cflags;
+	unsigned flags;
+	int ret, min_ret = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	size_t len = zc->sr.len;
+
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (zc->sr.flags & IORING_RECVSEND_POLL_FIRST))
+		return -EAGAIN;
+
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+retry_multishot:
+	if (io_do_buffer_select(req)) {
+		void __user *buf;
+
+		buf = io_buffer_select(req, &len, issue_flags);
+		if (!buf)
+			return -ENOBUFS;
+		zc->sr.buf = buf;
+	}
+
+	ret = import_single_range(READ, zc->sr.buf, len, &iov, &msg.msg_iter);
+	if (unlikely(ret))
+		goto out_free;
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = NULL;
+	msg.msg_get_inq = 1;
+	msg.msg_flags = 0;
+	msg.msg_controllen = 0;
+	msg.msg_iocb = NULL;
+	msg.msg_ubuf = NULL;
+
+	flags = zc->sr.msg_flags;
+	if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
+	zrd = (struct zctap_read_desc) {
+		.iov_limit = msg_data_left(&msg),
+		.recv_limit = zc->datalen,
+		.iter = &msg.msg_iter,
+		.ctx = req->ctx,
+		.copy_bgid = zc->copy_bgid,
+	};
+
+	ret = io_zctap_recv(sock, &zrd, &msg, flags);
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && force_nonblock) {
+			if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+
+			return -EAGAIN;
+		}
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			zc->sr.len -= ret;
+			zc->sr.buf += ret;
+			zc->sr.done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
+		req_set_fail(req);
+	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+out_free:
+		req_set_fail(req);
+	}
+
+	if (ret > 0)
+		ret += zc->sr.done_io;
+	else if (zc->sr.done_io)
+		ret = zc->sr.done_io;
+	else
+		io_kbuf_recycle(req, issue_flags);
+
+	cflags = io_put_kbuf(req, issue_flags);
+	if (msg.msg_inq)
+		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+
+	if (!io_recv_finish(req, &ret, cflags, ret <= 0))
+		goto retry_multishot;
+
+	return ret;
+}
+
 void io_send_zc_cleanup(struct io_kiocb *req)
 {
 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 83dc0f9ad3b2..14b42811a78e 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,7 @@
 #include "poll.h"
 #include "cancel.h"
 #include "rw.h"
+#include "zctap.h"
 
 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
 {
@@ -521,6 +522,20 @@ const struct io_op_def io_op_defs[] = {
 		.fail			= io_sendrecv_fail,
 #else
 		.prep			= io_eopnotsupp_prep,
+#endif
+	},
+	[IORING_OP_RECV_ZC] = {
+		.name			= "RECV_ZC",
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollin			= 1,
+		.buffer_select		= 1,
+		.ioprio			= 1,
+#if defined(CONFIG_NET)
+		.prep			= io_recvzc_prep,
+		.issue			= io_recvzc,
+#else
+		.prep			= io_eopnotsupp_prep,
 #endif
 	},
 };
diff --git a/io_uring/zctap.c b/io_uring/zctap.c
index a84204c6eb96..32efa7e9199d 100644
--- a/io_uring/zctap.c
+++ b/io_uring/zctap.c
@@ -7,6 +7,7 @@
 #include <linux/io_uring.h>
 #include <linux/netdevice.h>
 #include <linux/nospec.h>
+#include <net/tcp.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -53,6 +54,11 @@ static u64 zctap_page_info(const struct page *page)
 	return page_private(page);
 }
 
+static u16 zctap_page_region_id(const struct page *page)
+{
+	return (zctap_page_info(page) >> 16) & 0xffff;
+}
+
 static u16 zctap_page_id(const struct page *page)
 {
 	return zctap_page_info(page) & 0xffff;
@@ -72,6 +78,14 @@ static bool zctap_page_ours(struct page *page)
 #define IO_ZCTAP_UREF		0x10000
 #define IO_ZCTAP_KREF_MASK	(IO_ZCTAP_UREF - 1)
 
+static void io_zctap_get_buf_uref(struct ifq_region *ifr, u16 pgid)
+{
+	if (WARN_ON(pgid >= ifr->nr_pages))
+		return;
+
+	atomic_add(IO_ZCTAP_UREF, &ifr->buf[pgid].refcount);
+}
+
 /* return user refs back, indicate whether buffer is reusable */
 static bool io_zctap_put_buf_uref(struct io_zctap_buf *buf)
 {
@@ -392,6 +406,18 @@ static void io_zctap_ifq_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 	}
 }
 
+static struct io_zctap_ifq *io_zctap_skb_ifq(struct sk_buff *skb)
+{
+	struct io_zctap_ifq_priv *priv;
+	struct ubuf_info *uarg = skb_zcopy(skb);
+
+	if (uarg && uarg->callback == io_zctap_ifq_callback) {
+		priv = container_of(uarg, struct io_zctap_ifq_priv, uarg);
+		return &priv->ifq;
+	}
+	return NULL;
+}
+
 static struct io_zctap_ifq *io_zctap_ifq_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_zctap_ifq_priv *priv;
@@ -485,3 +511,317 @@ void io_unregister_zctap_all(struct io_ring_ctx *ctx)
 	for (i = 0; i < NR_ZCTAP_IFQS; i++)
 		io_unregister_zctap_ifq(ctx, i);
 }
+
+static int __zctap_get_user_buffer(struct zctap_read_desc *zrd, int len)
+{
+	if (!zrd->buflen) {
+		zrd->req = (struct io_kiocb) {
+			.ctx = zrd->ctx,
+			.buf_index = zrd->copy_bgid,
+		};
+
+		zrd->buf = (u8 *)io_zctap_buffer(&zrd->req, &zrd->buflen);
+		zrd->offset = 0;
+	}
+	return len > zrd->buflen ? zrd->buflen : len;
+}
+
+static int zctap_copy_data(struct zctap_read_desc *zrd, int len, u8 *kaddr)
+{
+	struct io_uring_zctap_iov zov;
+	u32 space;
+	int err;
+
+	space = zrd->iov_space + sizeof(zov);
+	if (space > zrd->iov_limit)
+		return 0;
+
+	len = __zctap_get_user_buffer(zrd, len);
+	if (!len)
+		return -ENOBUFS;
+
+	err = copy_to_user(zrd->buf + zrd->offset, kaddr, len);
+	if (err)
+		return -EFAULT;
+
+	zov = (struct io_uring_zctap_iov) {
+		.off = zrd->offset,
+		.len = len,
+		.bgid = zrd->copy_bgid,
+		.bid = zrd->req.buf_index,
+	};
+
+	if (copy_to_iter(&zov, sizeof(zov), zrd->iter) != sizeof(zov))
+		return -EFAULT;
+
+	zrd->offset += len;
+	zrd->buflen -= len;
+	zrd->iov_space = space;
+
+	return len;
+}
+
+static int zctap_copy_frag(struct zctap_read_desc *zrd, struct page *page,
+			   int off, int len, struct io_uring_zctap_iov *zov)
+{
+	u8 *kaddr;
+	int err;
+
+	len = __zctap_get_user_buffer(zrd, len);
+	if (!len)
+		return -ENOBUFS;
+
+	kaddr = kmap(page) + off;
+	err = copy_to_user(zrd->buf + zrd->offset, kaddr, len);
+	kunmap(page);
+
+	if (err)
+		return -EFAULT;
+
+	*zov = (struct io_uring_zctap_iov) {
+		.off = zrd->offset,
+		.len = len,
+		.bgid = zrd->copy_bgid,
+		.bid = zrd->req.buf_index,
+	};
+
+	zrd->offset += len;
+	zrd->buflen -= len;
+
+	return len;
+}
+
+static int zctap_recv_frag(struct zctap_read_desc *zrd,
+			   struct io_zctap_ifq *ifq,
+			   const skb_frag_t *frag, int off, int len)
+{
+	struct io_uring_zctap_iov zov;
+	struct page *page;
+	u32 space;
+	int pgid;
+
+	space = zrd->iov_space + sizeof(zov);
+	if (space > zrd->iov_limit)
+		return 0;
+
+	page = skb_frag_page(frag);
+	off += skb_frag_off(frag);
+
+	if (likely(ifq && ifq->ctx == zrd->ctx && zctap_page_ours(page))) {
+		pgid = zctap_page_id(page);
+		io_zctap_get_buf_uref(ifq->region, pgid);
+		zov = (struct io_uring_zctap_iov) {
+			.off = off,
+			.len = len,
+			.bgid = zctap_page_region_id(page),
+			.bid = pgid,
+		};
+	} else {
+		len = zctap_copy_frag(zrd, page, off, len, &zov);
+		if (len <= 0)
+			return len;
+	}
+
+	if (copy_to_iter(&zov, sizeof(zov), zrd->iter) != sizeof(zov))
+		return -EFAULT;
+
+	zrd->iov_space = space;
+
+	return len;
+}
+
+/* Our version of __skb_datagram_iter  -- should work for UDP also. */
+static int
+zctap_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+	       unsigned int offset, size_t len)
+{
+	struct zctap_read_desc *zrd = desc->arg.data;
+	struct io_zctap_ifq *ifq;
+	unsigned start, start_off;
+	struct sk_buff *frag_iter;
+	int i, copy, end, off;
+	int ret = 0;
+
+	if (zrd->iov_space >= zrd->iov_limit) {
+		desc->count = 0;
+		return 0;
+	}
+	if (len > zrd->recv_limit)
+		len = zrd->recv_limit;
+
+	start = skb_headlen(skb);
+	start_off = offset;
+
+	ifq = io_zctap_skb_ifq(skb);
+
+	if (offset < start) {
+		copy = start - offset;
+		if (copy > len)
+			copy = len;
+
+		/* copy out linear data */
+		ret = zctap_copy_data(zrd, copy, skb->data + offset);
+		if (ret < 0)
+			goto out;
+		offset += ret;
+		len -= ret;
+		if (len == 0 || ret != copy)
+			goto out;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag;
+
+		WARN_ON(start > offset + len);
+
+		frag = &skb_shinfo(skb)->frags[i];
+		end = start + skb_frag_size(frag);
+
+		if (offset < end) {
+			copy = end - offset;
+			if (copy > len)
+				copy = len;
+
+			off = offset - start;
+			ret = zctap_recv_frag(zrd, ifq, frag, off, copy);
+			if (ret < 0)
+				goto out;
+
+			offset += ret;
+			len -= ret;
+			if (len == 0 || ret != copy)
+				goto out;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if (offset < end) {
+			copy = end - offset;
+			if (copy > len)
+				copy = len;
+
+			off = offset - start;
+			ret = zctap_recv_skb(desc, frag_iter, off, copy);
+			if (ret < 0)
+				goto out;
+
+			offset += ret;
+			len -= ret;
+			if (len == 0 || ret != copy)
+				goto out;
+		}
+		start = end;
+	}
+
+out:
+	if (offset == start_off)
+		return ret;
+	return offset - start_off;
+}
+
+static int __io_zctap_tcp_read(struct sock *sk, struct zctap_read_desc *zrd)
+{
+	read_descriptor_t rd_desc = {
+		.arg.data = zrd,
+		.count = 1,
+	};
+
+	return tcp_read_sock(sk, &rd_desc, zctap_recv_skb);
+}
+
+static int io_zctap_tcp_recvmsg(struct sock *sk, struct zctap_read_desc *zrd,
+				int flags, int *addr_len)
+{
+	size_t used;
+	long timeo;
+	int ret;
+
+	ret = used = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	while (zrd->recv_limit) {
+		ret = __io_zctap_tcp_read(sk, zrd);
+		if (ret < 0)
+			break;
+		if (!ret) {
+			if (used)
+				break;
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				ret = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				ret = -EAGAIN;
+				break;
+			}
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				break;
+			sk_wait_data(sk, &timeo, NULL);
+			if (signal_pending(current)) {
+				ret = sock_intr_errno(timeo);
+				break;
+			}
+			continue;
+		}
+		zrd->recv_limit -= ret;
+		used += ret;
+
+		if (!timeo)
+			break;
+		release_sock(sk);
+		lock_sock(sk);
+
+		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current))
+			break;
+	}
+
+	release_sock(sk);
+
+	/* XXX, handle timestamping */
+
+	if (used)
+		return used;
+
+	return ret;
+}
+
+int io_zctap_recv(struct socket *sock, struct zctap_read_desc *zrd,
+		  struct msghdr *msg, unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	const struct proto *prot;
+	int addr_len = 0;
+	int ret;
+
+	if (flags & MSG_ERRQUEUE)
+		return -EOPNOTSUPP;
+
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot->recvmsg != tcp_recvmsg)
+		return -EPROTONOSUPPORT;
+
+	sock_rps_record_flow(sk);
+
+	ret = io_zctap_tcp_recvmsg(sk, zrd, flags, &addr_len);
+	if (ret >= 0) {
+		msg->msg_namelen = addr_len;
+		ret = zrd->iov_space;
+	}
+	return ret;
+}
diff --git a/io_uring/zctap.h b/io_uring/zctap.h
index bb44f8e972e8..4db516707d19 100644
--- a/io_uring/zctap.h
+++ b/io_uring/zctap.h
@@ -2,10 +2,30 @@
 #ifndef IOU_ZCTAP_H
 #define IOU_ZCTAP_H
 
+struct zctap_read_desc {
+	struct io_ring_ctx *ctx;
+	struct iov_iter *iter;
+	u32 iov_space;
+	u32 iov_limit;
+	u32 recv_limit;
+
+	struct io_kiocb req;
+	u8 *buf;
+	size_t offset;
+	size_t buflen;
+
+	u16 copy_bgid;			/* XXX move to register ifq? */
+};
+
 int io_register_ifq(struct io_ring_ctx *ctx,
 		    struct io_uring_ifq_req __user *arg);
 void io_unregister_zctap_all(struct io_ring_ctx *ctx);
 
 int io_provide_ifq_region(struct io_zctap_ifq *ifq, u16 id);
 
+int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
+int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_zctap_recv(struct socket *sock, struct zctap_read_desc *zrd,
+		  struct msghdr *msg, unsigned int flags);
+
 #endif
-- 
2.30.2


  parent reply	other threads:[~2022-11-02 23:40 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-02 23:32 [RFC PATCH v3 00/15] zero-copy RX for io_uring Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 01/15] io_uring: add zctap ifq definition Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 02/15] netdevice: add SETUP_ZCTAP to the netdev_bpf structure Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 03/15] io_uring: add register ifq opcode Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 04/15] io_uring: create a zctap region for a mapped buffer Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 05/15] io_uring: mark pages in ifq region with zctap information Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 06/15] io_uring: Provide driver API for zctap packet buffers Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 07/15] io_uring: Allocate zctap device buffers and dma map them Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 08/15] io_uring: Add zctap buffer get/put functions and refcounting Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 09/15] skbuff: Introduce SKBFL_FIXED_FRAG and skb_fixed() Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 10/15] io_uring: Allocate a uarg for use by the ifq RX Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 11/15] io_uring: Define the zctap iov[] returned to the user Jonathan Lemon
2022-11-02 23:32 ` Jonathan Lemon [this message]
2022-11-02 23:32 ` [RFC PATCH v3 13/15] io_uring: Make remove_ifq_region a delayed work call Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 14/15] io_uring: Add a buffer caching mechanism for zctap Jonathan Lemon
2022-11-02 23:32 ` [RFC PATCH v3 15/15] io_uring: Notify the application as the fillq is drained Jonathan Lemon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox