public inbox for [email protected]
 help / color / mirror / Atom feed
From: David Wei <[email protected]>
To: [email protected], [email protected]
Cc: Jens Axboe <[email protected]>,
	Pavel Begunkov <[email protected]>,
	Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
	"David S. Miller" <[email protected]>,
	Eric Dumazet <[email protected]>,
	Jesper Dangaard Brouer <[email protected]>,
	David Ahern <[email protected]>,
	Mina Almasry <[email protected]>,
	Willem de Bruijn <[email protected]>,
	Dragos Tatulea <[email protected]>
Subject: [PATCH 05/20] io_uring/zcrx: implement socket registration
Date: Tue,  7 Nov 2023 13:40:30 -0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

From: Pavel Begunkov <[email protected]>

We want userspace to explicitly list all sockets it'll be using with a
particular zc ifq, so we can properly configure them, e.g. binding the
sockets to the corresponding interface and setting steering rules. We'll
also need it to better control ifq lifetime and for
termination / unregistration purposes.

TODO: remove zc_rx_idx from struct socket. The uapi is also likely to change.

Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: David Wei <[email protected]>
---
 include/linux/net.h           |  2 ++
 include/uapi/linux/io_uring.h |  7 ++++
 io_uring/io_uring.c           |  6 ++++
 io_uring/net.c                | 19 +++++++++++
 io_uring/zc_rx.c              | 63 +++++++++++++++++++++++++++++++++++
 io_uring/zc_rx.h              | 17 ++++++++++
 net/socket.c                  |  1 +
 7 files changed, 115 insertions(+)

diff --git a/include/linux/net.h b/include/linux/net.h
index c9b4a63791a4..867061a91d30 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -126,6 +126,8 @@ struct socket {
 	const struct proto_ops	*ops; /* Might change with IPV6_ADDRFORM or MPTCP. */
 
 	struct socket_wq	wq;
+
+	unsigned		zc_rx_idx;
 };
 
 /*
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ae5608bcd785..917d0025cc94 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -550,6 +550,7 @@ enum {
 
 	/* register a network interface queue for zerocopy */
 	IORING_REGISTER_ZC_RX_IFQ		= 26,
+	IORING_REGISTER_ZC_RX_SOCK		= 27,
 
 	/* this goes last */
 	IORING_REGISTER_LAST,
@@ -788,6 +789,12 @@ struct io_uring_zc_rx_ifq_reg {
 	struct io_rbuf_cqring_offsets cq_off;
 };
 
+struct io_uring_zc_rx_sock_reg {
+	__u32	sockfd;
+	__u32	zc_rx_ifq_idx;
+	__u32	__resv[2];
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f06e9ed397da..e24e2c308a8a 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4549,6 +4549,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_zc_rx_ifq(ctx, arg);
 		break;
+	case IORING_REGISTER_ZC_RX_SOCK:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_zc_rx_sock(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 7a8e298af81b..fc0b7936971d 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -955,6 +955,25 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	return ret;
 }
 
+static __maybe_unused
+struct io_zc_rx_ifq *io_zc_verify_sock(struct io_kiocb *req,
+					struct socket *sock)
+{
+	unsigned token = READ_ONCE(sock->zc_rx_idx);
+	unsigned ifq_idx = token >> IO_ZC_IFQ_IDX_OFFSET;
+	unsigned sock_idx = token & IO_ZC_IFQ_IDX_MASK;
+	struct io_zc_rx_ifq *ifq;
+
+	if (ifq_idx)
+		return NULL;
+	ifq = req->ctx->ifq;
+	if (!ifq || sock_idx >= ifq->nr_sockets)
+		return NULL;
+	if (ifq->sockets[sock_idx] != req->file)
+		return NULL;
+	return ifq;
+}
+
 void io_send_zc_cleanup(struct io_kiocb *req)
 {
 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
diff --git a/io_uring/zc_rx.c b/io_uring/zc_rx.c
index 85180c3044d8..b5266a67395e 100644
--- a/io_uring/zc_rx.c
+++ b/io_uring/zc_rx.c
@@ -11,6 +11,7 @@
 #include "io_uring.h"
 #include "kbuf.h"
 #include "zc_rx.h"
+#include "rsrc.h"
 
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 
@@ -129,12 +130,74 @@ int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx)
 {
 	struct io_zc_rx_ifq *ifq = ctx->ifq;
+	int i;
 
 	if (!ifq)
 		return -EINVAL;
 
+	for (i = 0; i < ifq->nr_sockets; i++)
+		fput(ifq->sockets[i]);
+
 	ctx->ifq = NULL;
 	io_zc_rx_ifq_free(ifq);
 	return 0;
 }
+
+int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
+			   struct io_uring_zc_rx_sock_reg __user *arg)
+{
+	struct io_uring_zc_rx_sock_reg sr;
+	struct io_zc_rx_ifq *ifq;
+	struct socket *sock;
+	struct file *file;
+	int ret = -EEXIST;
+	int idx;
+
+	if (copy_from_user(&sr, arg, sizeof(sr)))
+		return -EFAULT;
+	if (sr.__resv[0] || sr.__resv[1])
+		return -EINVAL;
+	if (sr.zc_rx_ifq_idx != 0 || !ctx->ifq)
+		return -EINVAL;
+
+	ifq = ctx->ifq;
+	if (ifq->nr_sockets >= ARRAY_SIZE(ifq->sockets))
+		return -EINVAL;
+
+	BUILD_BUG_ON(ARRAY_SIZE(ifq->sockets) > IO_ZC_IFQ_IDX_MASK);
+
+	file = fget(sr.sockfd);
+	if (!file)
+		return -EBADF;
+
+	if (io_file_need_scm(file)) {
+		fput(file);
+		return -EBADF;
+	}
+
+	sock = sock_from_file(file);
+	if (unlikely(!sock || !sock->sk)) {
+		fput(file);
+		return -ENOTSOCK;
+	}
+
+	idx = ifq->nr_sockets;
+	lock_sock(sock->sk);
+	if (!sock->zc_rx_idx) {
+		unsigned token;
+
+		token = idx + (sr.zc_rx_ifq_idx << IO_ZC_IFQ_IDX_OFFSET);
+		WRITE_ONCE(sock->zc_rx_idx, token);
+		ret = 0;
+	}
+	release_sock(sock->sk);
+
+	if (ret) {
+		fput(file);
+		return -EINVAL;
+	}
+	ifq->sockets[idx] = file;
+	ifq->nr_sockets++;
+	return 0;
+}
 #endif
diff --git a/io_uring/zc_rx.h b/io_uring/zc_rx.h
index 5f6d80c1c2b8..ab25f8dbb433 100644
--- a/io_uring/zc_rx.h
+++ b/io_uring/zc_rx.h
@@ -2,6 +2,13 @@
 #ifndef IOU_ZC_RX_H
 #define IOU_ZC_RX_H
 
+#include <linux/io_uring_types.h>
+#include <linux/skbuff.h>
+
+#define IO_ZC_MAX_IFQ_SOCKETS		16
+#define IO_ZC_IFQ_IDX_OFFSET		16
+#define IO_ZC_IFQ_IDX_MASK		((1U << IO_ZC_IFQ_IDX_OFFSET) - 1)
+
 struct io_zc_rx_ifq {
 	struct io_ring_ctx	*ctx;
 	struct net_device	*dev;
@@ -11,6 +18,9 @@ struct io_zc_rx_ifq {
 	u32			rq_entries, cq_entries;
 	void			*pool;
 
+	unsigned		nr_sockets;
+	struct file		*sockets[IO_ZC_MAX_IFQ_SOCKETS];
+
 	/* hw rx descriptor ring id */
 	u32			if_rxq_id;
 };
@@ -19,6 +29,8 @@ struct io_zc_rx_ifq {
 int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zc_rx_ifq_reg __user *arg);
 int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx);
+int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
+			   struct io_uring_zc_rx_sock_reg __user *arg);
 #else
 static inline int io_register_zc_rx_ifq(struct io_ring_ctx *ctx,
 			  struct io_uring_zc_rx_ifq_reg __user *arg)
@@ -29,6 +41,11 @@ static inline int io_unregister_zc_rx_ifq(struct io_ring_ctx *ctx)
 {
 	return -EOPNOTSUPP;
 }
+static inline int io_register_zc_rx_sock(struct io_ring_ctx *ctx,
+				struct io_uring_zc_rx_sock_reg __user *arg)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #endif
diff --git a/net/socket.c b/net/socket.c
index c4a6f5532955..419b7bda3f9c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -637,6 +637,7 @@ struct socket *sock_alloc(void)
 
 	sock = SOCKET_I(inode);
 
+	sock->zc_rx_idx = 0;
 	inode->i_ino = get_next_ino();
 	inode->i_mode = S_IFSOCK | S_IRWXUGO;
 	inode->i_uid = current_fsuid();
-- 
2.39.3


  parent reply	other threads:[~2023-11-07 21:41 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-07 21:40 [RFC PATCH v2 00/20] Zero copy Rx using io_uring David Wei
2023-11-07 21:40 ` [PATCH 01/20] io_uring: add interface queue David Wei
2023-11-07 21:40 ` [PATCH 02/20] io_uring: add mmap support for shared ifq ringbuffers David Wei
2023-11-07 21:40 ` [PATCH 03/20] netdev: add XDP_SETUP_ZC_RX command David Wei
2023-11-07 21:40 ` [PATCH 04/20] io_uring: setup ZC for an Rx queue when registering an ifq David Wei
2023-11-07 21:40 ` David Wei [this message]
2023-11-07 21:40 ` [PATCH 06/20] io_uring: add ZC buf and pool David Wei
2023-11-07 21:40 ` [PATCH 07/20] io_uring: add ZC pool API David Wei
2023-11-07 21:40 ` [PATCH 08/20] skbuff: add SKBFL_FIXED_FRAG and skb_fixed() David Wei
2023-11-07 21:40 ` [PATCH 09/20] io_uring: allocate a uarg for freeing zero copy skbs David Wei
2023-11-07 21:40 ` [PATCH 10/20] io_uring: delay ZC pool destruction David Wei
2023-11-07 21:40 ` [PATCH 11/20] net: add data pool David Wei
2023-11-07 21:40 ` [PATCH 12/20] io_uring: add io_recvzc request David Wei
2023-11-07 21:40 ` [PATCH 13/20] io_uring/zcrx: propagate ifq down the stack David Wei
2023-11-07 21:40 ` [PATCH 14/20] io_uring/zcrx: introduce io_zc_get_rbuf_cqe David Wei
2023-11-07 21:40 ` [PATCH 15/20] io_uring/zcrx: add copy fallback David Wei
2023-11-07 21:40 ` [PATCH 16/20] net: execute custom callback from napi David Wei
2023-11-07 21:40 ` [PATCH 17/20] io_uring/zcrx: copy fallback to ring buffers David Wei
2023-11-07 21:40 ` [PATCH 18/20] veth: add support for io_uring zc rx David Wei
2023-11-07 21:40 ` [PATCH 19/20] bnxt: use data pool David Wei
2023-11-07 21:40 ` [PATCH 20/20] io_uring/zcrx: add multi socket support per Rx queue David Wei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.