public inbox for [email protected]
* [PATCHSET v3 0/8] Support for provided buffers for send
@ 2024-02-25  0:35 Jens Axboe
  2024-02-25  0:35 ` [PATCH 1/8] io_uring/net: unify how recvmsg and sendmsg copy in the msghdr Jens Axboe
                   ` (7 more replies)
  0 siblings, 8 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring

Hi,

We never supported provided buffers for sends, because it didn't seem
to make a lot of sense. But it actually does make a lot of sense! If
an app is receiving data, doing something with it, and then sending
either the same or another buffer out based on that, then using
provided buffers for sends guarantees that the sends are serialized.
This is because provided buffer rings are FIFO ordered by construction,
and hence it doesn't really matter if you have more than one send
inflight.

This provides a nice efficiency win, but more importantly, it reduces
the complexity in the application as it no longer needs to track a
potential backlog of sends. The app just sets up a send-side buffer
ring, exactly like it does for incoming data. And that's it, no more
manually serializing sends.
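
For reference, a rough sketch of what the application side could look
like with liburing (error handling elided; BGID, NBUFS, sockfd, and the
helper names are placeholders made up for this sketch, see the proxy
example [1] for a complete program):

    #include <liburing.h>

    #define BGID  1   /* arbitrary buffer group ID */
    #define NBUFS 8

    /* once at init: register the ring of send buffers for group BGID */
    static struct io_uring_buf_ring *setup_send_ring(struct io_uring *ring)
    {
        int ret;

        return io_uring_setup_buf_ring(ring, NBUFS, BGID, 0, &ret);
    }

    /* push outgoing data into the provided buffer ring (FIFO), then
     * arm a send that picks its buffer from group BGID */
    static void queue_send(struct io_uring *ring, struct io_uring_buf_ring *br,
                           int sockfd, void *data, unsigned len,
                           unsigned short bid)
    {
        struct io_uring_sqe *sqe;

        io_uring_buf_ring_add(br, data, len, bid,
                              io_uring_buf_ring_mask(NBUFS), 0);
        io_uring_buf_ring_advance(br, 1);

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_send(sqe, sockfd, NULL, 0, 0);
        sqe->flags |= IOSQE_BUFFER_SELECT;
        sqe->buf_group = BGID;
    }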

In some testing with proxy [1], basic shuffling of packets shows a 36%
improvement with this over manually dealing with sends. That's a pretty
big win on top of making the app simpler. Using multishot brings a
further nice improvement on top of that. I still need to re-benchmark
on the 10G setup; in a VM it's looking pretty tasty.

You can also find the patches here:

https://git.kernel.dk/cgit/linux/log/?h=io_uring-send-queue

[1] https://git.kernel.dk/cgit/liburing/tree/examples/proxy.c

Changes since v2:

- Add prep patches unifying send/recv header handling
- Add multishot mode as well
- Fix issue with REQ_F_BL_EMPTY on ring provided buffers

-- 
Jens Axboe



* [PATCH 1/8] io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
  2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
@ 2024-02-25  0:35 ` Jens Axboe
  2024-02-26 14:41   ` Pavel Begunkov
  2024-02-25  0:35 ` [PATCH 2/8] net: remove {recv,send}msg_copy_msghdr() from exports Jens Axboe
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

For recvmsg, we roll our own msghdr copy since we support buffer
selection. This isn't the case for sendmsg right now but, in
preparation for adding it there too, make the recvmsg copy helpers
generic so we can call them from the sendmsg side as well.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/net.c | 275 +++++++++++++++++++++++++------------------------
 1 file changed, 140 insertions(+), 135 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 161622029147..fcbaeb7cc045 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -204,16 +204,148 @@ static int io_setup_async_msg(struct io_kiocb *req,
 	return -EAGAIN;
 }
 
+static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
+{
+	int hdr;
+
+	if (iomsg->namelen < 0)
+		return true;
+	if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
+			       iomsg->namelen, &hdr))
+		return true;
+	if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr))
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_COMPAT
+static int __io_compat_msg_copy_hdr(struct io_kiocb *req,
+				    struct io_async_msghdr *iomsg,
+				    struct sockaddr __user **addr, int ddir)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+	struct compat_msghdr msg;
+	struct compat_iovec __user *uiov;
+	int ret;
+
+	if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
+		return -EFAULT;
+
+	ret = __get_compat_msghdr(&iomsg->msg, &msg, addr);
+	if (ret)
+		return ret;
+
+	uiov = compat_ptr(msg.msg_iov);
+	if (req->flags & REQ_F_BUFFER_SELECT) {
+		compat_ssize_t clen;
+
+		iomsg->free_iov = NULL;
+		if (msg.msg_iovlen == 0) {
+			sr->len = 0;
+		} else if (msg.msg_iovlen > 1) {
+			return -EINVAL;
+		} else {
+			if (!access_ok(uiov, sizeof(*uiov)))
+				return -EFAULT;
+			if (__get_user(clen, &uiov->iov_len))
+				return -EFAULT;
+			if (clen < 0)
+				return -EINVAL;
+			sr->len = clen;
+		}
+
+		if (ddir == ITER_DEST && req->flags & REQ_F_APOLL_MULTISHOT) {
+			iomsg->namelen = msg.msg_namelen;
+			iomsg->controllen = msg.msg_controllen;
+			if (io_recvmsg_multishot_overflow(iomsg))
+				return -EOVERFLOW;
+		}
+	} else {
+		iomsg->free_iov = iomsg->fast_iov;
+		ret = __import_iovec(ddir, (struct iovec __user *)uiov,
+				     msg.msg_iovlen, UIO_FASTIOV,
+				     &iomsg->free_iov, &iomsg->msg.msg_iter,
+				     true);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+#endif
+
+static int __io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
+			     struct sockaddr __user **addr, int ddir)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+	struct user_msghdr msg;
+	int ret;
+
+	if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg)))
+		return -EFAULT;
+
+	ret = __copy_msghdr(&iomsg->msg, &msg, addr);
+	if (ret)
+		return ret;
+
+	if (req->flags & REQ_F_BUFFER_SELECT) {
+		if (msg.msg_iovlen == 0) {
+			sr->len = iomsg->fast_iov[0].iov_len = 0;
+			iomsg->fast_iov[0].iov_base = NULL;
+			iomsg->free_iov = NULL;
+		} else if (msg.msg_iovlen > 1) {
+			return -EINVAL;
+		} else {
+			if (copy_from_user(iomsg->fast_iov, msg.msg_iov,
+					   sizeof(*msg.msg_iov)))
+				return -EFAULT;
+			sr->len = iomsg->fast_iov[0].iov_len;
+			iomsg->free_iov = NULL;
+		}
+
+		if (ddir == ITER_DEST && req->flags & REQ_F_APOLL_MULTISHOT) {
+			iomsg->namelen = msg.msg_namelen;
+			iomsg->controllen = msg.msg_controllen;
+			if (io_recvmsg_multishot_overflow(iomsg))
+				return -EOVERFLOW;
+		}
+	} else {
+		iomsg->free_iov = iomsg->fast_iov;
+		ret = __import_iovec(ddir, msg.msg_iov, msg.msg_iovlen,
+				     UIO_FASTIOV, &iomsg->free_iov,
+				     &iomsg->msg.msg_iter, false);
+		if (ret > 0)
+			ret = 0;
+	}
+
+	return ret;
+}
+
+static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
+			   struct sockaddr __user **addr, int ddir)
+{
+	iomsg->msg.msg_name = &iomsg->addr;
+	iomsg->msg.msg_iter.nr_segs = 0;
+
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		return __io_compat_msg_copy_hdr(req, iomsg, addr, ddir);
+#endif
+
+	return __io_msg_copy_hdr(req, iomsg, addr, ddir);
+}
+
 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 			       struct io_async_msghdr *iomsg)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	int ret;
 
-	iomsg->msg.msg_name = &iomsg->addr;
-	iomsg->free_iov = iomsg->fast_iov;
-	ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags,
-					&iomsg->free_iov);
+	ret = io_msg_copy_hdr(req, iomsg, NULL, ITER_SOURCE);
+	if (ret)
+		return ret;
+
 	/* save msg_control as sys_sendmsg() overwrites it */
 	sr->msg_control = iomsg->msg.msg_control_user;
 	return ret;
@@ -435,142 +567,15 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 
-static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
-{
-	int hdr;
-
-	if (iomsg->namelen < 0)
-		return true;
-	if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
-			       iomsg->namelen, &hdr))
-		return true;
-	if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr))
-		return true;
-
-	return false;
-}
-
-static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
-				 struct io_async_msghdr *iomsg)
-{
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
-	struct user_msghdr msg;
-	int ret;
-
-	if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg)))
-		return -EFAULT;
-
-	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
-	if (ret)
-		return ret;
-
-	if (req->flags & REQ_F_BUFFER_SELECT) {
-		if (msg.msg_iovlen == 0) {
-			sr->len = iomsg->fast_iov[0].iov_len = 0;
-			iomsg->fast_iov[0].iov_base = NULL;
-			iomsg->free_iov = NULL;
-		} else if (msg.msg_iovlen > 1) {
-			return -EINVAL;
-		} else {
-			if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov)))
-				return -EFAULT;
-			sr->len = iomsg->fast_iov[0].iov_len;
-			iomsg->free_iov = NULL;
-		}
-
-		if (req->flags & REQ_F_APOLL_MULTISHOT) {
-			iomsg->namelen = msg.msg_namelen;
-			iomsg->controllen = msg.msg_controllen;
-			if (io_recvmsg_multishot_overflow(iomsg))
-				return -EOVERFLOW;
-		}
-	} else {
-		iomsg->free_iov = iomsg->fast_iov;
-		ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
-				     &iomsg->free_iov, &iomsg->msg.msg_iter,
-				     false);
-		if (ret > 0)
-			ret = 0;
-	}
-
-	return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
-					struct io_async_msghdr *iomsg)
-{
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
-	struct compat_msghdr msg;
-	struct compat_iovec __user *uiov;
-	int ret;
-
-	if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
-		return -EFAULT;
-
-	ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
-	if (ret)
-		return ret;
-
-	uiov = compat_ptr(msg.msg_iov);
-	if (req->flags & REQ_F_BUFFER_SELECT) {
-		compat_ssize_t clen;
-
-		iomsg->free_iov = NULL;
-		if (msg.msg_iovlen == 0) {
-			sr->len = 0;
-		} else if (msg.msg_iovlen > 1) {
-			return -EINVAL;
-		} else {
-			if (!access_ok(uiov, sizeof(*uiov)))
-				return -EFAULT;
-			if (__get_user(clen, &uiov->iov_len))
-				return -EFAULT;
-			if (clen < 0)
-				return -EINVAL;
-			sr->len = clen;
-		}
-
-		if (req->flags & REQ_F_APOLL_MULTISHOT) {
-			iomsg->namelen = msg.msg_namelen;
-			iomsg->controllen = msg.msg_controllen;
-			if (io_recvmsg_multishot_overflow(iomsg))
-				return -EOVERFLOW;
-		}
-	} else {
-		iomsg->free_iov = iomsg->fast_iov;
-		ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen,
-				   UIO_FASTIOV, &iomsg->free_iov,
-				   &iomsg->msg.msg_iter, true);
-		if (ret < 0)
-			return ret;
-	}
-
-	return 0;
-}
-#endif
-
-static int io_recvmsg_copy_hdr(struct io_kiocb *req,
-			       struct io_async_msghdr *iomsg)
-{
-	iomsg->msg.msg_name = &iomsg->addr;
-	iomsg->msg.msg_iter.nr_segs = 0;
-
-#ifdef CONFIG_COMPAT
-	if (req->ctx->compat)
-		return __io_compat_recvmsg_copy_hdr(req, iomsg);
-#endif
-
-	return __io_recvmsg_copy_hdr(req, iomsg);
-}
-
 int io_recvmsg_prep_async(struct io_kiocb *req)
 {
+	struct io_async_msghdr *iomsg;
 	int ret;
 
 	if (!io_msg_alloc_async_prep(req))
 		return -ENOMEM;
-	ret = io_recvmsg_copy_hdr(req, req->async_data);
+	iomsg = req->async_data;
+	ret = io_msg_copy_hdr(req, iomsg, &iomsg->uaddr, ITER_DEST);
 	if (!ret)
 		req->flags |= REQ_F_NEED_CLEANUP;
 	return ret;
@@ -793,7 +798,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (req_has_async_data(req)) {
 		kmsg = req->async_data;
 	} else {
-		ret = io_recvmsg_copy_hdr(req, &iomsg);
+		ret = io_msg_copy_hdr(req, &iomsg, &iomsg.uaddr, ITER_DEST);
 		if (ret)
 			return ret;
 		kmsg = &iomsg;
-- 
2.43.0



* [PATCH 2/8] net: remove {recv,send}msg_copy_msghdr() from exports
  2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
  2024-02-25  0:35 ` [PATCH 1/8] io_uring/net: unify how recvmsg and sendmsg copy in the msghdr Jens Axboe
@ 2024-02-25  0:35 ` Jens Axboe
  2024-02-25  0:35 ` [PATCH 3/8] io_uring/net: add provided buffer support for IORING_OP_SEND Jens Axboe
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

The only user of these was io_uring, and it's not using them anymore.
Make them static and remove them from the socket header file.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/socket.h                         |  7 -------
 net/socket.c                                   | 14 +++++++-------
 tools/perf/trace/beauty/include/linux/socket.h |  7 -------
 3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index cfcb7e2c3813..139c330ccf2c 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -422,13 +422,6 @@ extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
 			       struct user_msghdr __user *umsg,
 			       struct sockaddr __user *uaddr,
 			       unsigned int flags);
-extern int sendmsg_copy_msghdr(struct msghdr *msg,
-			       struct user_msghdr __user *umsg, unsigned flags,
-			       struct iovec **iov);
-extern int recvmsg_copy_msghdr(struct msghdr *msg,
-			       struct user_msghdr __user *umsg, unsigned flags,
-			       struct sockaddr __user **uaddr,
-			       struct iovec **iov);
 extern int __copy_msghdr(struct msghdr *kmsg,
 			 struct user_msghdr *umsg,
 			 struct sockaddr __user **save_addr);
diff --git a/net/socket.c b/net/socket.c
index ed3df2f749bf..0f5d5079fd91 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2600,9 +2600,9 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
 	return err;
 }
 
-int sendmsg_copy_msghdr(struct msghdr *msg,
-			struct user_msghdr __user *umsg, unsigned flags,
-			struct iovec **iov)
+static int sendmsg_copy_msghdr(struct msghdr *msg,
+			       struct user_msghdr __user *umsg, unsigned flags,
+			       struct iovec **iov)
 {
 	int err;
 
@@ -2753,10 +2753,10 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
 	return __sys_sendmmsg(fd, mmsg, vlen, flags, true);
 }
 
-int recvmsg_copy_msghdr(struct msghdr *msg,
-			struct user_msghdr __user *umsg, unsigned flags,
-			struct sockaddr __user **uaddr,
-			struct iovec **iov)
+static int recvmsg_copy_msghdr(struct msghdr *msg,
+			       struct user_msghdr __user *umsg, unsigned flags,
+			       struct sockaddr __user **uaddr,
+			       struct iovec **iov)
 {
 	ssize_t err;
 
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index cfcb7e2c3813..139c330ccf2c 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -422,13 +422,6 @@ extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
 			       struct user_msghdr __user *umsg,
 			       struct sockaddr __user *uaddr,
 			       unsigned int flags);
-extern int sendmsg_copy_msghdr(struct msghdr *msg,
-			       struct user_msghdr __user *umsg, unsigned flags,
-			       struct iovec **iov);
-extern int recvmsg_copy_msghdr(struct msghdr *msg,
-			       struct user_msghdr __user *umsg, unsigned flags,
-			       struct sockaddr __user **uaddr,
-			       struct iovec **iov);
 extern int __copy_msghdr(struct msghdr *kmsg,
 			 struct user_msghdr *umsg,
 			 struct sockaddr __user **save_addr);
-- 
2.43.0



* [PATCH 3/8] io_uring/net: add provided buffer support for IORING_OP_SEND
  2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
  2024-02-25  0:35 ` [PATCH 1/8] io_uring/net: unify how recvmsg and sendmsg copy in the msghdr Jens Axboe
  2024-02-25  0:35 ` [PATCH 2/8] net: remove {recv,send}msg_copy_msghdr() from exports Jens Axboe
@ 2024-02-25  0:35 ` Jens Axboe
  2024-02-25  0:35 ` [PATCH 4/8] io_uring/net: add provided buffer support for IORING_OP_SENDMSG Jens Axboe
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

It's pretty trivial to wire up provided buffer support for the send
side, just like we do on the receive side. This enables setting up
a buffer ring that an application can use to push pending sends to,
and then have a send pick a buffer from that ring.

One of the challenges with async IO and networking sends is that you
can get into reordering conditions if you have more than one inflight
at the same time. Consider the following scenario where everything is
fine:

1) App queues sendA for socket1
2) App queues sendB for socket1
3) App does io_uring_submit()
4) sendA is issued, completes successfully, posts CQE
5) sendB is issued, completes successfully, posts CQE

All is fine. Requests are always issued in-order, and both complete
inline as most sends do.

However, if we're flooding socket1 with sends, the following could
also result from the same sequence:

1) App queues sendA for socket1
2) App queues sendB for socket1
3) App does io_uring_submit()
4) sendA is issued, socket1 is full, poll is armed for retry
5) Space frees up in socket1, this triggers sendA retry via task_work
6) sendB is issued, completes successfully, posts CQE
7) sendA is retried, completes successfully, posts CQE

Now we've sent sendB before sendA, which can make things unhappy. If
both sendA and sendB had been using provided buffers, then it would look
as follows instead:

1) App queues dataA for sendA, queues sendA for socket1
2) App queues dataB for sendB, queues sendB for socket1
3) App does io_uring_submit()
4) sendA is issued, socket1 is full, poll is armed for retry
5) Space frees up in socket1, this triggers sendA retry via task_work
6) sendB is issued, picks first buffer (dataA), completes successfully,
   posts CQE (which says "I sent dataA")
7) sendA is retried, picks first buffer (dataB), completes successfully,
   posts CQE (which says "I sent dataB")

Now we've sent the data in order, and everybody is happy.

It's worth noting that this also opens the door for supporting multishot
sends, as provided buffers would be a prerequisite for that. Those can
trigger either when new buffers are added to the outgoing ring, or (if
stalled due to lack of space) when space frees up in the socket.
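
On the completion side, the CQE carries the ID of the buffer that a
given send consumed, just as on the receive side. A rough liburing-side
sketch of reaping such a completion (recycle_buffer() is a placeholder
for whatever bookkeeping the app does):

    struct io_uring_cqe *cqe;

    io_uring_wait_cqe(ring, &cqe);
    if (cqe->flags & IORING_CQE_F_BUFFER) {
        /* which provided buffer this send consumed; cqe->res is the
         * number of bytes sent from it */
        unsigned bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

        recycle_buffer(bid);
    }
    io_uring_cqe_seen(ring, cqe);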

Signed-off-by: Jens Axboe <[email protected]>
---
 include/uapi/linux/io_uring.h |  1 +
 io_uring/io_uring.c           |  3 ++-
 io_uring/net.c                | 19 ++++++++++++++++---
 io_uring/opdef.c              |  1 +
 4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7bd10201a02b..74c3afac9c63 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -522,6 +522,7 @@ struct io_uring_params {
 #define IORING_FEAT_CQE_SKIP		(1U << 11)
 #define IORING_FEAT_LINKED_FILE		(1U << 12)
 #define IORING_FEAT_REG_REG_RING	(1U << 13)
+#define IORING_FEAT_SEND_BUFS		(1U << 14)
 
 /*
  * io_uring_register(2) opcodes and arguments
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cf2f514b7cc0..f6332fc56bed 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3962,7 +3962,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
 			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
 			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
-			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
+			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
+			IORING_FEAT_SEND_BUFS;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/io_uring/net.c b/io_uring/net.c
index fcbaeb7cc045..10b6d8caf4da 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -489,7 +489,8 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 		ret += sr->done_io;
 	else if (sr->done_io)
 		ret = sr->done_io;
-	io_req_set_res(req, ret, 0);
+	cflags = io_put_kbuf(req, issue_flags);
+	io_req_set_res(req, ret, cflags);
 	return IOU_OK;
 }
 
@@ -497,8 +498,10 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct sockaddr_storage __address;
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
-	struct msghdr msg;
+	size_t len = sr->len;
+	unsigned int cflags;
 	struct socket *sock;
+	struct msghdr msg;
 	unsigned flags;
 	int min_ret = 0;
 	int ret;
@@ -531,7 +534,17 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
-	ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter);
+	if (io_do_buffer_select(req)) {
+		void __user *buf;
+
+		buf = io_buffer_select(req, &len, issue_flags);
+		if (!buf)
+			return -ENOBUFS;
+		sr->buf = buf;
+		sr->len = len;
+	}
+
+	ret = import_ubuf(ITER_SOURCE, sr->buf, len, &msg.msg_iter);
 	if (unlikely(ret))
 		return ret;
 
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9c080aadc5a6..88fbe5cfd379 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -273,6 +273,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.audit_skip		= 1,
 		.ioprio			= 1,
 		.manual_alloc		= 1,
+		.buffer_select		= 1,
 #if defined(CONFIG_NET)
 		.prep			= io_sendmsg_prep,
 		.issue			= io_send,
-- 
2.43.0



* [PATCH 4/8] io_uring/net: add provided buffer support for IORING_OP_SENDMSG
  2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
                   ` (2 preceding siblings ...)
  2024-02-25  0:35 ` [PATCH 3/8] io_uring/net: add provided buffer support for IORING_OP_SEND Jens Axboe
@ 2024-02-25  0:35 ` Jens Axboe
  2024-02-25  0:35 ` [PATCH 5/8] io_uring/kbuf: flag request if buffer pool is empty after buffer pick Jens Axboe
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Add provided buffer support for sendmsg as well; see the previous
commit, which added it to IORING_OP_SEND, for a longer explanation of
why this makes sense.
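
As a rough sketch of the userspace side (hedged, not from this series;
BGID is a placeholder group ID, and the msghdr here carries no iovec
since the payload is picked from the buffer group instead):

    struct msghdr msg = { 0 };  /* msg_name/msg_control as needed */
    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

    io_uring_prep_sendmsg(sqe, sockfd, &msg, 0);
    sqe->flags |= IOSQE_BUFFER_SELECT;  /* payload from group BGID */
    sqe->buf_group = BGID;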

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/net.c   | 15 ++++++++++++++-
 io_uring/opdef.c |  1 +
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 10b6d8caf4da..30afb394efd7 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -436,6 +436,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
+	unsigned int cflags;
 	unsigned flags;
 	int min_ret = 0;
 	int ret;
@@ -458,6 +459,17 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
 		return io_setup_async_msg(req, kmsg, issue_flags);
 
+	if (io_do_buffer_select(req)) {
+		void __user *buf;
+		size_t len = sr->len;
+
+		buf = io_buffer_select(req, &len, issue_flags);
+		if (!buf)
+			return -ENOBUFS;
+
+		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_SOURCE, buf, len);
+	}
+
 	flags = sr->msg_flags;
 	if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
@@ -576,7 +588,8 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 		ret += sr->done_io;
 	else if (sr->done_io)
 		ret = sr->done_io;
-	io_req_set_res(req, ret, 0);
+	cflags = io_put_kbuf(req, issue_flags);
+	io_req_set_res(req, ret, cflags);
 	return IOU_OK;
 }
 
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 88fbe5cfd379..1f6b09e61ef8 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -139,6 +139,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.pollout		= 1,
 		.ioprio			= 1,
 		.manual_alloc		= 1,
+		.buffer_select		= 1,
 #if defined(CONFIG_NET)
 		.prep			= io_sendmsg_prep,
 		.issue			= io_sendmsg,
-- 
2.43.0



* [PATCH 5/8] io_uring/kbuf: flag request if buffer pool is empty after buffer pick
  2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
                   ` (3 preceding siblings ...)
  2024-02-25  0:35 ` [PATCH 4/8] io_uring/net: add provided buffer support for IORING_OP_SENDMSG Jens Axboe
@ 2024-02-25  0:35 ` Jens Axboe
  2024-02-25  0:35 ` [PATCH 6/8] io_uring/net: support multishot for send Jens Axboe
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Normally we do an extra roundtrip for retries even if the buffer pool
has been depleted, as we don't check that upfront. Rather than add this
check, have the buffer selection methods mark the request with
REQ_F_BL_EMPTY if the used buffer group is out of buffers after this
selection. This is very cheap to do once we're all the way inside there
anyway, and it gives the caller a chance to make better decisions on
how to proceed.

For example, recv/recvmsg multishot could check this flag when it
decides whether to keep receiving or not.
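
A later patch in this series uses it exactly this way for multishot
send, where io_send_finish() stops the multishot loop once the buffer
that was just picked emptied the group:

    if (mshot_finished || req->flags & REQ_F_BL_EMPTY)
        goto finish;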

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h |  3 +++
 io_uring/kbuf.c                | 10 ++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index bd7071aeec5d..d8111d64812b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -480,6 +480,7 @@ enum {
 	REQ_F_POLL_NO_LAZY_BIT,
 	REQ_F_CANCEL_SEQ_BIT,
 	REQ_F_CAN_POLL_BIT,
+	REQ_F_BL_EMPTY_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -556,6 +557,8 @@ enum {
 	REQ_F_CANCEL_SEQ	= IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT),
 	/* file is pollable */
 	REQ_F_CAN_POLL		= IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
+	/* buffer list was empty after selection of buffer */
+	REQ_F_BL_EMPTY		= IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index ee866d646997..3d257ed9031b 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -139,6 +139,8 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
 		list_del(&kbuf->list);
 		if (*len == 0 || *len > kbuf->len)
 			*len = kbuf->len;
+		if (list_empty(&bl->buf_list))
+			req->flags |= REQ_F_BL_EMPTY;
 		req->flags |= REQ_F_BUFFER_SELECTED;
 		req->kbuf = kbuf;
 		req->buf_index = kbuf->bid;
@@ -152,12 +154,16 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 					  unsigned int issue_flags)
 {
 	struct io_uring_buf_ring *br = bl->buf_ring;
+	__u16 tail, head = bl->head;
 	struct io_uring_buf *buf;
-	__u16 head = bl->head;
 
-	if (unlikely(smp_load_acquire(&br->tail) == head))
+	tail = smp_load_acquire(&br->tail);
+	if (unlikely(tail == head))
 		return NULL;
 
+	if (head + 1 == tail)
+		req->flags |= REQ_F_BL_EMPTY;
+
 	head &= bl->mask;
 	/* mmaped buffers are always contig */
 	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
-- 
2.43.0



* [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
                   ` (4 preceding siblings ...)
  2024-02-25  0:35 ` [PATCH 5/8] io_uring/kbuf: flag request if buffer pool is empty after buffer pick Jens Axboe
@ 2024-02-25  0:35 ` Jens Axboe
  2024-02-26 10:47   ` Dylan Yudaken
  2024-02-25  0:35 ` [PATCH 7/8] io_uring/net: support multishot for sendmsg Jens Axboe
  2024-02-25  0:35 ` [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more Jens Axboe
  7 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

This works very much like the receive side, except for sends. The idea
is that an application can fill outgoing buffers in a provided buffer
group, and then arm a single send that will service them all. For now
this variant just terminates when we are out of buffers to send, and
hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
set, as per usual for multishot requests.

This only enables it for IORING_OP_SEND; IORING_OP_SENDMSG is coming
in a separate patch. However, this patch does do a lot of the prep
work that makes wiring up the sendmsg variant pretty trivial. They
share the prep side.

Enabling multishot for sends is, again, identical to the receive side:
the app sets IORING_SEND_MULTISHOT in sqe->ioprio. This flag has the
same value as IORING_RECV_MULTISHOT.
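
For illustration, arming one could look roughly like this from
userspace (a sketch assuming the flag from this patch; BGID is a
placeholder buffer group ID, not something defined by this series):

    struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

    io_uring_prep_send(sqe, sockfd, NULL, 0, 0);
    sqe->flags |= IOSQE_BUFFER_SELECT;  /* required for multishot send */
    sqe->buf_group = BGID;
    sqe->ioprio = IORING_SEND_MULTISHOT;

    /* per CQE: if IORING_CQE_F_MORE is clear, the request has
     * terminated (e.g. the buffer group ran dry) and must be
     * re-armed, as with other multishot requests */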

Signed-off-by: Jens Axboe <[email protected]>
---
 include/uapi/linux/io_uring.h |  8 +++
 io_uring/net.c                | 98 +++++++++++++++++++++++++++++------
 2 files changed, 89 insertions(+), 17 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 74c3afac9c63..6766e78ee03b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -351,9 +351,17 @@ enum io_uring_op {
  *				0 is reported if zerocopy was actually possible.
  *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
  *				(at least partially).
+ *
+ * IORING_SEND_MULTISHOT	Multishot send. Like the recv equivalent, must
+ *				be used with provided buffers. Keeps sending
+ *				from the given buffer group ID until it is
+ *				empty. Sets IORING_CQE_F_MORE if more
+ *				completions should be expected on behalf of
+ *				the same SQE.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
+#define IORING_SEND_MULTISHOT		IORING_RECV_MULTISHOT
 #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
 #define IORING_SEND_ZC_REPORT_USAGE	(1U << 3)
 
diff --git a/io_uring/net.c b/io_uring/net.c
index 30afb394efd7..8237ac5c957f 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -204,6 +204,16 @@ static int io_setup_async_msg(struct io_kiocb *req,
 	return -EAGAIN;
 }
 
+static inline void io_mshot_prep_retry(struct io_kiocb *req)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+
+	req->flags &= ~REQ_F_BL_EMPTY;
+	sr->done_io = 0;
+	sr->len = 0; /* get from the provided buffer */
+	req->buf_index = sr->buf_group;
+}
+
 static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
 {
 	int hdr;
@@ -401,6 +411,8 @@ void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
 	kfree(io->free_iov);
 }
 
+#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_SEND_MULTISHOT)
+
 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -417,11 +429,19 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->flags = READ_ONCE(sqe->ioprio);
-	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+	if (sr->flags & ~SENDMSG_FLAGS)
 		return -EINVAL;
 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
+	if (sr->flags & IORING_SEND_MULTISHOT) {
+		if (!(req->flags & REQ_F_BUFFER_SELECT))
+			return -EINVAL;
+		if (sr->msg_flags & MSG_WAITALL)
+			return -EINVAL;
+		req->flags |= REQ_F_APOLL_MULTISHOT;
+		sr->buf_group = req->buf_index;
+	}
 
 #ifdef CONFIG_COMPAT
 	if (req->ctx->compat)
@@ -431,6 +451,44 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
+static inline bool io_send_finish(struct io_kiocb *req, int *ret,
+				  struct msghdr *msg, unsigned issue_flags)
+{
+	bool mshot_finished = *ret <= 0;
+	unsigned int cflags;
+
+	cflags = io_put_kbuf(req, issue_flags);
+
+	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+		io_req_set_res(req, *ret, cflags);
+		*ret = IOU_OK;
+		return true;
+	}
+
+	if (mshot_finished || req->flags & REQ_F_BL_EMPTY)
+		goto finish;
+
+	/*
+	 * Fill CQE for this send and see if we should keep trying to
+	 * send from this socket.
+	 */
+	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
+				*ret, cflags | IORING_CQE_F_MORE)) {
+		io_mshot_prep_retry(req);
+		*ret = IOU_ISSUE_SKIP_COMPLETE;
+		return false;
+	}
+
+	/* Otherwise stop multishot but use the current result. */
+finish:
+	io_req_set_res(req, *ret, cflags);
+	if (issue_flags & IO_URING_F_MULTISHOT)
+		*ret = IOU_STOP_MULTISHOT;
+	else
+		*ret = IOU_OK;
+	return true;
+}
+
 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -511,7 +569,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	struct sockaddr_storage __address;
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	size_t len = sr->len;
-	unsigned int cflags;
 	struct socket *sock;
 	struct msghdr msg;
 	unsigned flags;
@@ -542,10 +599,14 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
 		return io_setup_async_addr(req, &__address, issue_flags);
 
+	if (!io_check_multishot(req, issue_flags))
+		return -EAGAIN;
+
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
+retry_multishot:
 	if (io_do_buffer_select(req)) {
 		void __user *buf;
 
@@ -570,8 +631,16 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	msg.msg_flags = flags;
 	ret = sock_sendmsg(sock, &msg);
 	if (ret < min_ret) {
-		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
-			return io_setup_async_addr(req, &__address, issue_flags);
+		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) {
+			ret = io_setup_async_addr(req, &__address, issue_flags);
+			if (ret != -EAGAIN)
+				return ret;
+			if (issue_flags & IO_URING_F_MULTISHOT) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+			return -EAGAIN;
+		}
 
 		if (ret > 0 && io_net_retry(sock, flags)) {
 			sr->len -= ret;
@@ -588,9 +657,13 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 		ret += sr->done_io;
 	else if (sr->done_io)
 		ret = sr->done_io;
-	cflags = io_put_kbuf(req, issue_flags);
-	io_req_set_res(req, ret, cflags);
-	return IOU_OK;
+	else
+		io_kbuf_recycle(req, issue_flags);
+
+	if (!io_send_finish(req, &ret, &msg, issue_flags))
+		goto retry_multishot;
+
+	return ret;
 }
 
 int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -654,15 +727,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static inline void io_recv_prep_retry(struct io_kiocb *req)
-{
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
-
-	sr->done_io = 0;
-	sr->len = 0; /* get from the provided buffer */
-	req->buf_index = sr->buf_group;
-}
-
 /*
  * Finishes io_recv and io_recvmsg.
  *
@@ -697,7 +761,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 		struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
 
-		io_recv_prep_retry(req);
+		io_mshot_prep_retry(req);
 		/* Known not-empty or unknown state, retry */
 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
-- 
2.43.0



* [PATCH 7/8] io_uring/net: support multishot for sendmsg
  2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
                   ` (5 preceding siblings ...)
  2024-02-25  0:35 ` [PATCH 6/8] io_uring/net: support multishot for send Jens Axboe
@ 2024-02-25  0:35 ` Jens Axboe
  2024-02-25  0:35 ` [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more Jens Axboe
  7 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Same as the IORING_OP_SEND multishot mode. Needs further work, but it's
functional and can be tested.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/net.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 8237ac5c957f..240b8eff1a78 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -494,7 +494,6 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
-	unsigned int cflags;
 	unsigned flags;
 	int min_ret = 0;
 	int ret;
@@ -517,6 +516,10 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
 		return io_setup_async_msg(req, kmsg, issue_flags);
 
+	if (!io_check_multishot(req, issue_flags))
+		return io_setup_async_msg(req, kmsg, issue_flags);
+
+retry_multishot:
 	if (io_do_buffer_select(req)) {
 		void __user *buf;
 		size_t len = sr->len;
@@ -537,8 +540,14 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
 
 	if (ret < min_ret) {
-		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
-			return io_setup_async_msg(req, kmsg, issue_flags);
+		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) {
+			ret = io_setup_async_msg(req, kmsg, issue_flags);
+			if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+			return ret;
+		}
 		if (ret > 0 && io_net_retry(sock, flags)) {
 			kmsg->msg.msg_controllen = 0;
 			kmsg->msg.msg_control = NULL;
@@ -550,18 +559,22 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 			ret = -EINTR;
 		req_set_fail(req);
 	}
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
+	else
+		io_kbuf_recycle(req, issue_flags);
+
+	if (!io_send_finish(req, &ret, &kmsg->msg, issue_flags))
+		goto retry_multishot;
+
 	/* fast path, check for non-NULL to avoid function call */
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	io_netmsg_recycle(req, issue_flags);
-	if (ret >= 0)
-		ret += sr->done_io;
-	else if (sr->done_io)
-		ret = sr->done_io;
-	cflags = io_put_kbuf(req, issue_flags);
-	io_req_set_res(req, ret, cflags);
-	return IOU_OK;
+	return ret;
 }
 
 int io_send(struct io_kiocb *req, unsigned int issue_flags)
-- 
2.43.0



* [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more
  2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
                   ` (6 preceding siblings ...)
  2024-02-25  0:35 ` [PATCH 7/8] io_uring/net: support multishot for sendmsg Jens Axboe
@ 2024-02-25  0:35 ` Jens Axboe
  2024-02-26 10:59   ` Dylan Yudaken
  7 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-25  0:35 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

If we have more data pending, we know we're going to do one more loop.
If that's the case, then set MSG_MORE to inform the networking stack
that there's more data coming shortly for this socket.
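
At the socket level this mirrors the classic userspace corking pattern
(chunk_a/chunk_b and their lengths are placeholders):

    /* more data follows shortly, let the stack batch it up */
    send(fd, chunk_a, len_a, MSG_MORE);
    /* last chunk for now, flush */
    send(fd, chunk_b, len_b, 0);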

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/net.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 240b8eff1a78..07307dd5a077 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -519,6 +519,10 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (!io_check_multishot(req, issue_flags))
 		return io_setup_async_msg(req, kmsg, issue_flags);
 
+	flags = sr->msg_flags;
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		flags |= MSG_DONTWAIT;
+
 retry_multishot:
 	if (io_do_buffer_select(req)) {
 		void __user *buf;
@@ -528,12 +532,12 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 		if (!buf)
 			return -ENOBUFS;
 
+		if ((req->flags & (REQ_F_BL_EMPTY|REQ_F_APOLL_MULTISHOT)) ==
+				   REQ_F_APOLL_MULTISHOT)
+			flags |= MSG_MORE;
 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_SOURCE, buf, len);
 	}
 
-	flags = sr->msg_flags;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		flags |= MSG_DONTWAIT;
 	if (flags & MSG_WAITALL)
 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 
-- 
2.43.0



* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-25  0:35 ` [PATCH 6/8] io_uring/net: support multishot for send Jens Axboe
@ 2024-02-26 10:47   ` Dylan Yudaken
  2024-02-26 13:38     ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Dylan Yudaken @ 2024-02-26 10:47 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring

On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>
> This works very much like the receive side, except for sends. The idea
> is that an application can fill outgoing buffers in a provided buffer
> group, and then arm a single send that will service them all. For now
> this variant just terminates when we are out of buffers to send, and
> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
> set, as per usual for multishot requests.
>

This feels to me a lot like just using OP_SEND with MSG_WAITALL as
described, unless I'm missing something?

I actually could imagine it being useful for the previous patches' use
case of queuing up sends and keeping ordering, and I think the API is
more obvious (rather than the second CQE sending the first CQE's data).
So maybe it's worth only keeping one approach?


* Re: [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more
  2024-02-25  0:35 ` [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more Jens Axboe
@ 2024-02-26 10:59   ` Dylan Yudaken
  2024-02-26 13:42     ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Dylan Yudaken @ 2024-02-26 10:59 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring

On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>
> If we have more data pending, we know we're going to do one more loop.
> If that's the case, then set MSG_MORE to inform the networking stack
> that there's more data coming shortly for this socket.
>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
>  io_uring/net.c | 10 +++++++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/io_uring/net.c b/io_uring/net.c
> index 240b8eff1a78..07307dd5a077 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -519,6 +519,10 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
>         if (!io_check_multishot(req, issue_flags))
>                 return io_setup_async_msg(req, kmsg, issue_flags);
>
> +       flags = sr->msg_flags;
> +       if (issue_flags & IO_URING_F_NONBLOCK)
> +               flags |= MSG_DONTWAIT;
> +
>  retry_multishot:
>         if (io_do_buffer_select(req)) {
>                 void __user *buf;
> @@ -528,12 +532,12 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
>                 if (!buf)
>                         return -ENOBUFS;
>
> +               if ((req->flags & (REQ_F_BL_EMPTY|REQ_F_APOLL_MULTISHOT)) ==
> +                                  REQ_F_APOLL_MULTISHOT)
> +                       flags |= MSG_MORE;
>                 iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_SOURCE, buf, len);
>         }

This feels racy. I don't have an exact sequence in mind, but I believe
there are cases where, between the two calls to __sys_sendmsg_sock,
another submission could be issued and drain the buffer list. I guess
the result would be that the packet is never sent out, but I have not
followed the codepaths of MSG_MORE.

The obvious other way to trigger this codepath is if the user messes
with the ring by decrementing the buffer counter. I do not believe
there are any nefarious outcomes - but just to point out that
REQ_F_BL_EMPTY is essentially user controlled.


* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 10:47   ` Dylan Yudaken
@ 2024-02-26 13:38     ` Jens Axboe
  2024-02-26 14:02       ` Dylan Yudaken
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 13:38 UTC (permalink / raw)
  To: Dylan Yudaken; +Cc: io-uring

On 2/26/24 3:47 AM, Dylan Yudaken wrote:
> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>
>> This works very much like the receive side, except for sends. The idea
>> is that an application can fill outgoing buffers in a provided buffer
>> group, and then arm a single send that will service them all. For now
>> this variant just terminates when we are out of buffers to send, and
>> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
>> set, as per usual for multishot requests.
>>
> 
> This feels to me a lot like just using OP_SEND with MSG_WAITALL as
> described, unless I'm missing something?

How so? MSG_WAITALL is "send X amount of data, and if it's a short send,
try again" where multishot is "send data from this buffer group, and
keep sending data until it's empty". Hence it's the mirror of multishot
on the receive side. Unless I'm misunderstanding you somehow, not sure
it'd be smart to add special meaning to MSG_WAITALL with provided
buffers.

> I actually could imagine it being useful for the previous patches' use
> case of queuing up sends and keeping ordering,
> and I think the API is more obvious (rather than the second CQE
> sending the first CQE's data). So maybe it's worth only
> keeping one approach?

And here you totally lost me :-)

-- 
Jens Axboe



* Re: [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more
  2024-02-26 10:59   ` Dylan Yudaken
@ 2024-02-26 13:42     ` Jens Axboe
  2024-02-26 14:24       ` Pavel Begunkov
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 13:42 UTC (permalink / raw)
  To: Dylan Yudaken; +Cc: io-uring

On 2/26/24 3:59 AM, Dylan Yudaken wrote:
> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>
>> If we have more data pending, we know we're going to do one more loop.
>> If that's the case, then set MSG_MORE to inform the networking stack
>> that there's more data coming shortly for this socket.
>>
>> Signed-off-by: Jens Axboe <[email protected]>
>> ---
>>  io_uring/net.c | 10 +++++++---
>>  1 file changed, 7 insertions(+), 3 deletions(-)
>>
>> diff --git a/io_uring/net.c b/io_uring/net.c
>> index 240b8eff1a78..07307dd5a077 100644
>> --- a/io_uring/net.c
>> +++ b/io_uring/net.c
>> @@ -519,6 +519,10 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
>>         if (!io_check_multishot(req, issue_flags))
>>                 return io_setup_async_msg(req, kmsg, issue_flags);
>>
>> +       flags = sr->msg_flags;
>> +       if (issue_flags & IO_URING_F_NONBLOCK)
>> +               flags |= MSG_DONTWAIT;
>> +
>>  retry_multishot:
>>         if (io_do_buffer_select(req)) {
>>                 void __user *buf;
>> @@ -528,12 +532,12 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
>>                 if (!buf)
>>                         return -ENOBUFS;
>>
>> +               if ((req->flags & (REQ_F_BL_EMPTY|REQ_F_APOLL_MULTISHOT)) ==
>> +                                  REQ_F_APOLL_MULTISHOT)
>> +                       flags |= MSG_MORE;
>>                 iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_SOURCE, buf, len);
>>         }
> 
> This feels racy. I don't have an exact sequence in mind, but I believe
> there are cases where between
> the two calls to __sys_sendmsg_sock, another submission could be
> issued and drain the buffer list.
> I guess the result would be that the packet is never sent out, but I
> have not followed the codepaths of MSG_MORE.

This is true, but that race always exists depending on who gets to go
first (the adding of the buffer, or the send itself). The way I see it,
when the send is issued we're making the guarantee that we're going to
at least deplete the queue as it looks when entered. If more is added
while it's being processed, we _may_ see it.

Outside of that, we don't want it to potentially run in perpetuity. It
may actually be a good idea to make the rule of "just issue what was
there when first seen/issued" a hard one, though I don't think it's
really worth doing. But making any guarantees on buffers added in
parallel will be impossible. If you do that, then you have to deal with
figuring out what's left in the queue once you get a completion without
CQE_F_MORE.

> The obvious other way to trigger this codepath is if the user messes
> with the ring by decrementing
> the buffer counter. I do not believe there are any nefarious outcomes
> - but just to point out that
> REQ_F_BL_EMPTY is essentially user controlled.

The user may certainly shoot himself in the foot. As long as that
doesn't lead to a nefarious outcome, then that's not a concern. For this
case, the head is kernel local, user can only write to the tail. So we
could have a case of user fiddling with the tail and when we grab the
next buffer (and the previous one did not have REQ_F_BL_EMPTY set), the
ring will indeed appear to be empty. At that point you get an -ENOBUFS
without CQE_F_MORE set.

-- 
Jens Axboe



* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 13:38     ` Jens Axboe
@ 2024-02-26 14:02       ` Dylan Yudaken
  2024-02-26 14:27         ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Dylan Yudaken @ 2024-02-26 14:02 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring

On Mon, Feb 26, 2024 at 1:38 PM Jens Axboe <[email protected]> wrote:
>
> On 2/26/24 3:47 AM, Dylan Yudaken wrote:
> > On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
> >>
> >> This works very much like the receive side, except for sends. The idea
> >> is that an application can fill outgoing buffers in a provided buffer
> >> group, and then arm a single send that will service them all. For now
> >> this variant just terminates when we are out of buffers to send, and
> >> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
> >> set, as per usual for multishot requests.
> >>
> >
> > This feels to me a lot like just using OP_SEND with MSG_WAITALL as
> > described, unless I'm missing something?
>
> How so? MSG_WAITALL is "send X amount of data, and if it's a short send,
> try again" where multishot is "send data from this buffer group, and
> keep sending data until it's empty". Hence it's the mirror of multishot
> on the receive side. Unless I'm misunderstanding you somehow, not sure
> it'd be smart to add special meaning to MSG_WAITALL with provided
> buffers.
>

_If_ you have the data upfront these are very similar, and only differ
in that the multishot approach will give you more granular progress
updates. My point was that this might not be a valuable API to people
for only this use case.

You do make a good point about MSG_WAITALL though - multishot send
doesn't really make sense to me without MSG_WAITALL semantics. I cannot
imagine a useful use case where the first buffer being partially sent
will still want the second buffer sent.

> > I actually could imagine it being useful for the previous patches' use
> > case of queuing up sends and keeping ordering,
> > and I think the API is more obvious (rather than the second CQE
> > sending the first CQE's data). So maybe it's worth only
> > keeping one approach?
>
> And here you totally lost me :-)

I am suggesting here that you don't really need to support buffer
lists on send without multishot.

It's a slightly confusing API (to me) that you queue PushBuffer(A),
Send(A), PushBuffer(B), Send(B) and get back Res(B), Res(A), which are
in fact in order A->B.

Instead you could queue up PushBuffer(A), Send(Multishot),
PushBuffer(B), and get back Res(Multishot), Res(Multishot), which are
in order A -> B.

The downside here is that userspace has to handle requeueing the SQE
if A completes before B is pushed. I leave it to you if that is not
desirable. I can see arguments for both sides.


* Re: [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more
  2024-02-26 13:42     ` Jens Axboe
@ 2024-02-26 14:24       ` Pavel Begunkov
  2024-02-26 14:52         ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Pavel Begunkov @ 2024-02-26 14:24 UTC (permalink / raw)
  To: Jens Axboe, Dylan Yudaken; +Cc: io-uring

On 2/26/24 13:42, Jens Axboe wrote:
> On 2/26/24 3:59 AM, Dylan Yudaken wrote:
>> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>>
>>> If we have more data pending, we know we're going to do one more loop.
>>> If that's the case, then set MSG_MORE to inform the networking stack
>>> that there's more data coming shortly for this socket.
>>>
>>> Signed-off-by: Jens Axboe <[email protected]>
>>> ---
>>>   io_uring/net.c | 10 +++++++---
>>>   1 file changed, 7 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/io_uring/net.c b/io_uring/net.c
>>> index 240b8eff1a78..07307dd5a077 100644
>>> --- a/io_uring/net.c
>>> +++ b/io_uring/net.c
>>> @@ -519,6 +519,10 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
>>>          if (!io_check_multishot(req, issue_flags))
>>>                  return io_setup_async_msg(req, kmsg, issue_flags);
>>>
>>> +       flags = sr->msg_flags;
>>> +       if (issue_flags & IO_URING_F_NONBLOCK)
>>> +               flags |= MSG_DONTWAIT;
>>> +
>>>   retry_multishot:
>>>          if (io_do_buffer_select(req)) {
>>>                  void __user *buf;
>>> @@ -528,12 +532,12 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
>>>                  if (!buf)
>>>                          return -ENOBUFS;
>>>
>>> +               if ((req->flags & (REQ_F_BL_EMPTY|REQ_F_APOLL_MULTISHOT)) ==
>>> +                                  REQ_F_APOLL_MULTISHOT)
>>> +                       flags |= MSG_MORE;
>>>                  iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_SOURCE, buf, len);
>>>          }
>>
>> This feels racy. I don't have an exact sequence in mind, but I believe
>> there are cases where between
>> the two calls to __sys_sendmsg_sock, another submission could be
>> issued and drain the buffer list.
>> I guess the result would be that the packet is never sent out, but I
>> have not followed the codepaths of MSG_MORE.
> 
> This is true, but that race always exists depending on how gets to go
> first (the adding of the buffer, or the send itself). The way I see it,
> when the send is issued we're making the guarantee that we're going to
> at least deplete the queue as it looks when entered. If more is added
> while it's being processed, we _may_ see it.
> 
> Outside of that, we don't want it to potentially run in perpetuity. It
> may actually be a good idea to make the rule of "just issue what was
> there when first seen/issued" a hard one, though I don't think it's
> really worth doing. But making any guarantees on buffers added in
> parallel will be impossible. If you do that, then you have to deal with
> figuring out what's left in the queue once you get a completion withou
> CQE_F_MORE.
> 
>> The obvious other way to trigger this codepath is if the user messes
>> with the ring by decrementing
>> the buffer counter. I do not believe there are any nefarious outcomes
>> - but just to point out that
>> REQ_F_BL_EMPTY is essentially user controlled.
> 
> The user may certainly shoot himself in the foot. As long as that
> doesn't lead to a nefarious outcome, then that's not a concern. For this
> case, the head is kernel local, user can only write to the tail. So we
> could have a case of user fiddling with the tail and when we grab the
> next buffer (and the previous one did not have REQ_F_BL_EMPTY set), the
> ring will indeed appear to be empty. At that point you get an -ENOBUFS
> without CQE_F_MORE set.

A side note, don't forget that there are other protocols apart
from TCP. AFAIK UDP corking will pack it into a single datagram,
which is not the same as two separate sends.

-- 
Pavel Begunkov


* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 14:02       ` Dylan Yudaken
@ 2024-02-26 14:27         ` Jens Axboe
  2024-02-26 14:36           ` Pavel Begunkov
  2024-02-26 19:31           ` Dylan Yudaken
  0 siblings, 2 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 14:27 UTC (permalink / raw)
  To: Dylan Yudaken; +Cc: io-uring

On 2/26/24 7:02 AM, Dylan Yudaken wrote:
> On Mon, Feb 26, 2024 at 1:38 PM Jens Axboe <[email protected]> wrote:
>>
>> On 2/26/24 3:47 AM, Dylan Yudaken wrote:
>>> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>>>
>>>> This works very much like the receive side, except for sends. The idea
>>>> is that an application can fill outgoing buffers in a provided buffer
>>>> group, and then arm a single send that will service them all. For now
>>>> this variant just terminates when we are out of buffers to send, and
>>>> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
>>>> set, as per usual for multishot requests.
>>>>
>>>
>>> This feels to me a lot like just using OP_SEND with MSG_WAITALL as
>>> described, unless I'm missing something?
>>
>> How so? MSG_WAITALL is "send X amount of data, and if it's a short send,
>> try again" where multishot is "send data from this buffer group, and
>> keep sending data until it's empty". Hence it's the mirror of multishot
>> on the receive side. Unless I'm misunderstanding you somehow, not sure
>> it'd be smart to add special meaning to MSG_WAITALL with provided
>> buffers.
>>
> 
> _If_ you have the data upfront these are very similar, and only differ
> in that the multishot approach will give you more granular progress
> updates. My point was that this might not be a valuable API to people
> for only this use case.

Not sure I agree, it feels like attributing a different meaning to
MSG_WAITALL if you use a provided buffer vs if you don't. And that to me
would seem to be confusing. Particularly when we have multishot on the
receive side, and this is identical, just for sends. Receives will keep
receiving as long as there are buffers in the provided group to receive
into, and sends will keep sending for the same condition. Either one
will terminate if we run out of buffers.

If you make MSG_WAITALL be that for provided buffers + send, then that
behaves differently than MSG_WAITALL with receive, and MSG_WAITALL with
send _without_ provided buffers. I don't think overloading an existing
flag for this purpose is a good idea, particularly when we already have
the existing semantics for multishot on the receive side.

> You do make a good point about MSG_WAITALL though - multishot send
> doesn't really make sense to me without MSG_WAITALL semantics. I
> cannot imagine a useful use case where the first buffer being
> partially sent will still want the second buffer sent.

Right, and I need to tweak that. Maybe we require MSG_WAITALL, or we
make it implied for multishot send. Currently the code doesn't deal with
that.

Maybe if MSG_WAITALL isn't set and we get a short send we don't set
CQE_F_MORE and we just stop. If it is set, then we go through the usual
retry logic. That would make it identical to MSG_WAITALL send without
multishot, which again is something I like in that we don't have
different behaviors depending on which mode we are using.
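
From the application's point of view, the contract then boils down to
checking IORING_CQE_F_MORE. A rough sketch with plain liburing CQE
handling (error paths trimmed, 'ring' is an initialized io_uring):

	#include <liburing.h>

	/* Returns 1 if the multishot send terminated and needs
	 * re-arming, 0 if more completions will come, -1 on error */
	static int send_needs_rearm(struct io_uring *ring)
	{
		struct io_uring_cqe *cqe;
		int rearm;

		if (io_uring_wait_cqe(ring, &cqe))
			return -1;
		/* no F_MORE: out of buffers (or -ENOBUFS), or a short
		 * send without MSG_WAITALL as described above */
		rearm = !(cqe->flags & IORING_CQE_F_MORE);
		io_uring_cqe_seen(ring, cqe);
		return rearm;
	}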

>>> I actually could imagine it being useful for the previous patches' use
>>> case of queuing up sends and keeping ordering,
>>> and I think the API is more obvious (rather than the second CQE
>>> sending the first CQE's data). So maybe it's worth only
>>> keeping one approach?
>>
>> And here you totally lost me :-)
> 
> I am suggesting here that you don't really need to support buffer
> lists on send without multishot.

That is certainly true, but I also don't see a reason _not_ to support
it. Again mostly because this is how receive and everything else works.
The app is free to issue a single SQE for send without multishot, and
pick the first buffer and send it.
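
In liburing terms, something like the below sketch; buffer selection on
the send side is what this series adds, so treat the details as subject
to change:

	/* Single-shot send sourcing its data from provided buffer
	 * group 'bgid': addr/len stay zeroed, the next buffer (and
	 * its length) comes off the group FIFO */
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_send(sqe, sockfd, NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = bgid;
	io_uring_submit(&ring);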

> It's a slightly confusing API (to me) that you queue PushBuffer(A),
> Send(A), PushBuffer(B), Send(B)
> and get back Res(B), Res(A) which are in fact in order A->B.

Now I'm confused again. If you do the above sequence, the first CQE
posted would be Res(A), and then Res(B)?

> Instead you could queue up PushBuffer(A), Send(Multishot),
> PushBuffer(B), and get back Res(Multishot), Res(Multishot)
> which are in order A -> B.

There should be no difference in ordering of the posted completion
between the two.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 14:27         ` Jens Axboe
@ 2024-02-26 14:36           ` Pavel Begunkov
  2024-02-26 15:16             ` Jens Axboe
  2024-02-26 19:31           ` Dylan Yudaken
  1 sibling, 1 reply; 34+ messages in thread
From: Pavel Begunkov @ 2024-02-26 14:36 UTC (permalink / raw)
  To: Jens Axboe, Dylan Yudaken; +Cc: io-uring

On 2/26/24 14:27, Jens Axboe wrote:
> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>> On Mon, Feb 26, 2024 at 1:38 PM Jens Axboe <[email protected]> wrote:
>>>
>>> On 2/26/24 3:47 AM, Dylan Yudaken wrote:
>>>> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>>>>
>>>>> This works very much like the receive side, except for sends. The idea
>>>>> is that an application can fill outgoing buffers in a provided buffer
>>>>> group, and then arm a single send that will service them all. For now
>>>>> this variant just terminates when we are out of buffers to send, and
>>>>> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
>>>>> set, as per usual for multishot requests.
>>>>>
>>>>
>>>> This feels to me a lot like just using OP_SEND with MSG_WAITALL as
>>>> described, unless I'm missing something?
>>>
>>> How so? MSG_WAITALL is "send X amount of data, and if it's a short send,
>>> try again" where multishot is "send data from this buffer group, and
>>> keep sending data until it's empty". Hence it's the mirror of multishot
>>> on the receive side. Unless I'm misunderstanding you somehow, not sure
>>> it'd be smart to add special meaning to MSG_WAITALL with provided
>>> buffers.
>>>
>>
>> _If_ you have the data upfront these are very similar, and only differ
>> in that the multishot approach will give you more granular progress
>> updates. My point was that this might not be a valuable API to people
>> for only this use case.
> 
> Not sure I agree, it feels like attributing a different meaning to
> MSG_WAITALL if you use a provided buffer vs if you don't. And that to me
> would seem to be confusing. Particularly when we have multishot on the
> receive side, and this is identical, just for sends. Receives will keep
> receiving as long as there are buffers in the provided group to receive
> into, and sends will keep sending for the same condition. Either one
> will terminate if we run out of buffers.
> 
> If you make MSG_WAITALL be that for provided buffers + send, then that
> behaves differently than MSG_WAITALL with receive, and MSG_WAITALL with
> send _without_ provided buffers. I don't think overloading an existing
> flag for this purposes is a good idea, particularly when we already have
> the existing semantics for multishot on the receive side.

I'm actually with Dylan on that and wonder where the perf win
could come from. Let's assume TCP, sends are usually completed
in the same syscall, otherwise your pacing is just bad. Thrift,
for example, collects sends and packs them into one multi-iov request
during a loop iteration. If the req completes immediately then
the userspace just wouldn't have time to push more buffers by
definition (assuming single threading).

If you actually need to poll tx, you send a request and collect
data into iov in userspace in background. When the request
completes you send all that in batch. You can probably find
a niche example when batch=1 in this case, but I don't think
anyone would care.

The example doesn't use multi-iov, and also still has to
serialise requests, which naturally serialises buffer consumption
w/o provided bufs.
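
To be concrete about the batching I mean, a sketch (names are
illustrative, and msg/iov must stay valid until the request completes):

	#include <string.h>
	#include <sys/socket.h>
	#include <liburing.h>

	/* Gather this loop iteration's pending writes into one iovec
	 * array and issue a single sendmsg for the lot */
	static void flush_batch(struct io_uring *ring, int fd,
				struct msghdr *msg, struct iovec *iov,
				unsigned int nr)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		memset(msg, 0, sizeof(*msg));
		msg->msg_iov = iov;
		msg->msg_iovlen = nr;
		io_uring_prep_sendmsg(sqe, fd, msg, MSG_NOSIGNAL);
		io_uring_submit(ring);
	}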

>> You do make a good point about MSG_WAITALL though - multishot send
>> doesn't really make sense to me without MSG_WAITALL semantics. I
>> cannot imagine a useful use case where the first buffer being
>> partially sent will still want the second buffer sent.
> 
> Right, and I need to tweak that. Maybe we require MSG_WAITALL, or we
> make it implied for multishot send. Currently the code doesn't deal with
> that.
> 
> Maybe if MSG_WAITALL isn't set and we get a short send we don't set
> CQE_F_MORE and we just stop. If it is set, then we go through the usual
> retry logic. That would make it identical to MSG_WAITALL send without
> multishot, which again is something I like in that we don't have
> different behaviors depending on which mode we are using.
> 
>>>> I actually could imagine it being useful for the previous patches' use
>>>> case of queuing up sends and keeping ordering,
>>>> and I think the API is more obvious (rather than the second CQE
>>>> sending the first CQE's data). So maybe it's worth only
>>>> keeping one approach?
>>>
>>> And here you totally lost me :-)
>>
>> I am suggesting here that you don't really need to support buffer
>> lists on send without multishot.
> 
> That is certainly true, but I also don't see a reason _not_ to support
> it. Again mostly because this is how receive and everything else works.
> The app is free to issue a single SQE for send without multishot, and
> pick the first buffer and send it.

Multishot sounds interesting, but I don't see it being much use if
you terminate when there are no buffers. Otherwise, if it continues
to sit there, someone would have to wake it up

>> It's a slightly confusing API (to me) that you queue PushBuffer(A),
>> Send(A), PushBuffer(B), Send(B)
>> and get back Res(B), Res(A) which are in fact in order A->B.
> 
> Now I'm confused again. If you do the above sequence, the first CQE
> posted would be Res(A), and then Res(B)?
> 
>> Instead you could queue up PushBuffer(A), Send(Multishot),
>> PushBuffer(B), and get back Res(Multishot), Res(Multishot)
>> which are in order A -> B.
> 
> There should be no difference in ordering of the posted completion
> between the two.
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/8] io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
  2024-02-25  0:35 ` [PATCH 1/8] io_uring/net: unify how recvmsg and sendmsg copy in the msghdr Jens Axboe
@ 2024-02-26 14:41   ` Pavel Begunkov
  2024-02-26 15:03     ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Pavel Begunkov @ 2024-02-26 14:41 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 2/25/24 00:35, Jens Axboe wrote:
> For recvmsg, we roll our own since we support buffer selections. This
> isn't the case for sendmsg right now, but in preparation for doing so,
> make the recvmsg copy helpers generic so we can call them from the
> sendmsg side as well.

I thought about it before, definitely like the idea, I think
patches 1-2 can probably get merged first.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more
  2024-02-26 14:24       ` Pavel Begunkov
@ 2024-02-26 14:52         ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 14:52 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/26/24 7:24 AM, Pavel Begunkov wrote:
> On 2/26/24 13:42, Jens Axboe wrote:
>> On 2/26/24 3:59 AM, Dylan Yudaken wrote:
>>> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>>>
>>>> If we have more data pending, we know we're going to do one more loop.
>>>> If that's the case, then set MSG_MORE to inform the networking stack
>>>> that there's more data coming shortly for this socket.
>>>>
>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>> ---
>>>>   io_uring/net.c | 10 +++++++---
>>>>   1 file changed, 7 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/io_uring/net.c b/io_uring/net.c
>>>> index 240b8eff1a78..07307dd5a077 100644
>>>> --- a/io_uring/net.c
>>>> +++ b/io_uring/net.c
>>>> @@ -519,6 +519,10 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
>>>>          if (!io_check_multishot(req, issue_flags))
>>>>                  return io_setup_async_msg(req, kmsg, issue_flags);
>>>>
>>>> +       flags = sr->msg_flags;
>>>> +       if (issue_flags & IO_URING_F_NONBLOCK)
>>>> +               flags |= MSG_DONTWAIT;
>>>> +
>>>>   retry_multishot:
>>>>          if (io_do_buffer_select(req)) {
>>>>                  void __user *buf;
>>>> @@ -528,12 +532,12 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
>>>>                  if (!buf)
>>>>                          return -ENOBUFS;
>>>>
>>>> +               if ((req->flags & (REQ_F_BL_EMPTY|REQ_F_APOLL_MULTISHOT)) ==
>>>> +                                  REQ_F_APOLL_MULTISHOT)
>>>> +                       flags |= MSG_MORE;
>>>>                  iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_SOURCE, buf, len);
>>>>          }
>>>
>>> This feels racy. I don't have an exact sequence in mind, but I believe
>>> there are cases where between
>>> the two calls to __sys_sendmsg_sock, another submission could be
>>> issued and drain the buffer list.
>>> I guess the result would be that the packet is never sent out, but I
>>> have not followed the codepaths of MSG_MORE.
>>
>> This is true, but that race always exists depending on who gets to go
>> first (the adding of the buffer, or the send itself). The way I see it,
>> when the send is issued we're making the guarantee that we're going to
>> at least deplete the queue as it looks when entered. If more is added
>> while it's being processed, we _may_ see it.
>>
>> Outside of that, we don't want it to potentially run in perpetuity. It
>> may actually be a good idea to make the rule of "just issue what was
>> there when first seen/issued" a hard one, though I don't think it's
>> really worth doing. But making any guarantees on buffers added in
>> parallel will be impossible. If you do that, then you have to deal with
>> figuring out what's left in the queue once you get a completion without
>> CQE_F_MORE.
>>
>>> The obvious other way to trigger this codepath is if the user messes
>>> with the ring by decrementing
>>> the buffer counter. I do not believe there are any nefarious outcomes
>>> - but just to point out that
>>> REQ_F_BL_EMPTY is essentially user controlled.
>>
>> The user may certainly shoot himself in the foot. As long as that
>> doesn't lead to a nefarious outcome, then that's not a concern. For this
>> case, the head is kernel local, user can only write to the tail. So we
>> could have a case of user fiddling with the tail and when we grab the
>> next buffer (and the previous one did not have REQ_F_BL_EMPTY set), the
>> ring will indeed appear to be empty. At that point you get an -ENOBUFS
>> without CQE_F_MORE set.
> 
> A side note, don't forget that there are other protocols apart
> from TCP. AFAIK UDP corking will pack it into a single datagram,
> which is not the same as two separate sends.

Yeah, I should really have labeled this one as a test/rfc kind of patch. I
wasn't even convinced we want to do this unconditionally for TCP. I'll
just leave it at the end for now, it's a separate kind of discussion
imho and this is why it was left as a separate patch rather than being
bundled with the multishot send in general.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/8] io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
  2024-02-26 14:41   ` Pavel Begunkov
@ 2024-02-26 15:03     ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 15:03 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 2/26/24 7:41 AM, Pavel Begunkov wrote:
> On 2/25/24 00:35, Jens Axboe wrote:
>> For recvmsg, we roll our own since we support buffer selections. This
>> isn't the case for sendmsg right now, but in preparation for doing so,
>> make the recvmsg copy helpers generic so we can call them from the
>> sendmsg side as well.
> 
> I thought about it before, definitely like the idea, I think
> patches 1-2 can probably get merged first.

Yep, I'll queue 1 right now, and then just send patch 2 to netdev once
patch 1 lands upstream. There's no rush on patch 2, half of it could
have gone in months ago in fact.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 14:36           ` Pavel Begunkov
@ 2024-02-26 15:16             ` Jens Axboe
  2024-02-26 15:41               ` Pavel Begunkov
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 15:16 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/26/24 7:36 AM, Pavel Begunkov wrote:
> On 2/26/24 14:27, Jens Axboe wrote:
>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>> On Mon, Feb 26, 2024 at 1:38 PM Jens Axboe <[email protected]> wrote:
>>>>
>>>> On 2/26/24 3:47 AM, Dylan Yudaken wrote:
>>>>> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>>>>>
>>>>>> This works very much like the receive side, except for sends. The idea
>>>>>> is that an application can fill outgoing buffers in a provided buffer
>>>>>> group, and then arm a single send that will service them all. For now
>>>>>> this variant just terminates when we are out of buffers to send, and
>>>>>> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
>>>>>> set, as per usual for multishot requests.
>>>>>>
>>>>>
>>>>> This feels to me a lot like just using OP_SEND with MSG_WAITALL as
>>>>> described, unless I'm missing something?
>>>>
>>>> How so? MSG_WAITALL is "send X amount of data, and if it's a short send,
>>>> try again" where multishot is "send data from this buffer group, and
>>>> keep sending data until it's empty". Hence it's the mirror of multishot
>>>> on the receive side. Unless I'm misunderstanding you somehow, not sure
>>>> it'd be smart to add special meaning to MSG_WAITALL with provided
>>>> buffers.
>>>>
>>>
>>> _If_ you have the data upfront these are very similar, and only differ
>>> in that the multishot approach will give you more granular progress
>>> updates. My point was that this might not be a valuable API to people
>>> for only this use case.
>>
>> Not sure I agree, it feels like attributing a different meaning to
>> MSG_WAITALL if you use a provided buffer vs if you don't. And that to me
>> would seem to be confusing. Particularly when we have multishot on the
>> receive side, and this is identical, just for sends. Receives will keep
>> receiving as long as there are buffers in the provided group to receive
>> into, and sends will keep sending for the same condition. Either one
>> will terminate if we run out of buffers.
>>
>> If you make MSG_WAITALL be that for provided buffers + send, then that
>> behaves differently than MSG_WAITALL with receive, and MSG_WAITALL with
>> send _without_ provided buffers. I don't think overloading an existing
>> flag for this purpose is a good idea, particularly when we already have
>> the existing semantics for multishot on the receive side.
> 
> I'm actually with Dylan on that and wonder where the perf win
> could come from. Let's assume TCP, sends are usually completed
> in the same syscall, otherwise your pacing is just bad. Thrift,
> for example, collects sends and packs into one multi iov request
> during a loop iteration. If the req completes immediately then
> the userspace just wouldn't have time to push more buffers by
> definition (assuming single threading).

The problem only occurs when they don't complete inline, and now you get
reordering. The application could of course attempt to do proper pacing
and see if it can avoid that condition. If not, it now needs to
serialize sends. Using provided buffers makes this very easy, as you
don't need to care about it at all, and it eliminates complexity in the
application dealing with this.
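
The producer side in the app reduces to a couple of calls per buffer
(sketch; br is the registered buffer ring, BUFS its size):

	/* Publish a filled buffer to the FIFO provided buffer ring;
	 * the armed send will drain buffers in this order */
	io_uring_buf_ring_add(br, data, data_len, bid,
			      io_uring_buf_ring_mask(BUFS), 0);
	io_uring_buf_ring_advance(br, 1);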

> If you actually need to poll tx, you send a request and collect
> data into iov in userspace in background. When the request
> completes you send all that in batch. You can probably find
> a niche example when batch=1 in this case, but I don't think
> anyone would care.
> 
> The example doesn't use multi-iov, and also still has to
> serialise requests, which naturally serialises buffer consumption
> w/o provided bufs.

IMHO there's no reason NOT to have both a send with provided buffers and
a multishot send. The alternative would be to have send-N, where you
pass in N. But I don't see much point to that over "just drain the whole
pending list". The obvious use case is definitely send multishot, but
what would the reasoning be to prohibit pacing by explicitly disallowing
only doing a single buffer (or a partial queue)? As mentioned earlier, I
like keeping the symmetry with the receive side for multishot, and not
make it any different unless there's a reason to.

>>> You do make a good point about MSG_WAITALL though - multishot send
>>> doesn't really make sense to me without MSG_WAITALL semantics. I
>>> cannot imagine a useful use case where the first buffer being
>>> partially sent will still want the second buffer sent.
>>
>> Right, and I need to tweak that. Maybe we require MSG_WAITALL, or we
>> make it implied for multishot send. Currently the code doesn't deal with
>> that.
>>
>> Maybe if MSG_WAITALL isn't set and we get a short send we don't set
>> CQE_F_MORE and we just stop. If it is set, then we go through the usual
>> retry logic. That would make it identical to MSG_WAITALL send without
>> multishot, which again is something I like in that we don't have
>> different behaviors depending on which mode we are using.
>>
>>>>> I actually could imagine it being useful for the previous patches' use
>>>>> case of queuing up sends and keeping ordering,
>>>>> and I think the API is more obvious (rather than the second CQE
>>>>> sending the first CQE's data). So maybe it's worth only
>>>>> keeping one approach?
>>>>
>>>> And here you totally lost me :-)
>>>
>>> I am suggesting here that you don't really need to support buffer
>>> lists on send without multishot.
>>
>> That is certainly true, but I also don't see a reason _not_ to support
>> it. Again mostly because this is how receive and everything else works.
>> The app is free to issue a single SQE for send without multishot, and
>> pick the first buffer and send it.
> 
> Multishot sounds interesting, but I don't see it being much use if
> you terminate when there are no buffers. Otherwise, if it continues
> to sit there, someone would have to wake it up

I did think about the termination case, and the problem is that if there
are no buffers, you need it to wake when there are buffers. And at that
point you may as well just do another send, as you need the application
to trigger it. The alternative would be to invent a way to trigger that
wakeup, which would be send only and weird just because of that.

If you end up poll armed because the socket buffer is full, then it
certainly makes sense to stay armed as there's a natural trigger there
when that condition resolves.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 15:16             ` Jens Axboe
@ 2024-02-26 15:41               ` Pavel Begunkov
  2024-02-26 19:11                 ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Pavel Begunkov @ 2024-02-26 15:41 UTC (permalink / raw)
  To: Jens Axboe, Dylan Yudaken; +Cc: io-uring

On 2/26/24 15:16, Jens Axboe wrote:
> On 2/26/24 7:36 AM, Pavel Begunkov wrote:
>> On 2/26/24 14:27, Jens Axboe wrote:
>>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>>> On Mon, Feb 26, 2024 at 1:38 PM Jens Axboe <[email protected]> wrote:
>>>>>
>>>>> On 2/26/24 3:47 AM, Dylan Yudaken wrote:
>>>>>> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>>>>>>
>>>>>>> This works very much like the receive side, except for sends. The idea
>>>>>>> is that an application can fill outgoing buffers in a provided buffer
>>>>>>> group, and then arm a single send that will service them all. For now
>>>>>>> this variant just terminates when we are out of buffers to send, and
>>>>>>> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
>>>>>>> set, as per usual for multishot requests.
>>>>>>>
>>>>>>
>>>>>> This feels to me a lot like just using OP_SEND with MSG_WAITALL as
>>>>>> described, unless I'm missing something?
>>>>>
>>>>> How so? MSG_WAITALL is "send X amount of data, and if it's a short send,
>>>>> try again" where multishot is "send data from this buffer group, and
>>>>> keep sending data until it's empty". Hence it's the mirror of multishot
>>>>> on the receive side. Unless I'm misunderstanding you somehow, not sure
>>>>> it'd be smart to add special meaning to MSG_WAITALL with provided
>>>>> buffers.
>>>>>
>>>>
>>>> _If_ you have the data upfront these are very similar, and only differ
>>>> in that the multishot approach will give you more granular progress
>>>> updates. My point was that this might not be a valuable API to people
>>>> for only this use case.
>>>
>>> Not sure I agree, it feels like attributing a different meaning to
>>> MSG_WAITALL if you use a provided buffer vs if you don't. And that to me
>>> would seem to be confusing. Particularly when we have multishot on the
>>> receive side, and this is identical, just for sends. Receives will keep
>>> receiving as long as there are buffers in the provided group to receive
>>> into, and sends will keep sending for the same condition. Either one
>>> will terminate if we run out of buffers.
>>>
>>> If you make MSG_WAITALL be that for provided buffers + send, then that
>>> behaves differently than MSG_WAITALL with receive, and MSG_WAITALL with
>>> send _without_ provided buffers. I don't think overloading an existing
>>> flag for this purpose is a good idea, particularly when we already have
>>> the existing semantics for multishot on the receive side.
>>
>> I'm actually with Dylan on that and wonder where the perf win
>> could come from. Let's assume TCP, sends are usually completed
>> in the same syscall, otherwise your pacing is just bad. Thrift,
>> for example, collects sends and packs them into one multi-iov request
>> during a loop iteration. If the req completes immediately then
>> the userspace just wouldn't have time to push more buffers by
>> definition (assuming single threading).
> 
> The problem only occurs when they don't complete inline, and now you get
> reordering. The application could of course attempt to do proper pacing
> and see if it can avoid that condition. If not, it now needs to

Ok, I admit that there are more than a few valid cases where artificial
pacing is not an option, which is why I also laid out the polling case.
Let's also say that limits the potential perf wins to streaming and very
large transfers (like files), not "lots of relatively small
request-response" kinds of apps.

> serialize sends. Using provided buffers makes this very easy, as you
> don't need to care about it at all, and it eliminates complexity in the
> application dealing with this.

If I'm correct, the example also serialises sends(?). I don't
think it's that much simpler. You batch, you send. Same with this,
but you batch into a provided buffer and the send is conditional.

Another downside is that you need a provided queue per socket,
which sounds pretty expensive for apps with 100s if not 1000s
of sockets.

>> If you actually need to poll tx, you send a request and collect
>> data into iov in userspace in background. When the request
>> completes you send all that in batch. You can probably find
>> a niche example when batch=1 in this case, but I don't think
>> anyone would care.
>>
>> The example doesn't use multi-iov, and also still has to
>> serialise requests, which naturally serialises buffer consumption
>> w/o provided bufs.
> 
> IMHO there's no reason NOT to have both a send with provided buffers and
> a multishot send. The alternative would be to have send-N, where you
> pass in N. But I don't see much point to that over "just drain the whole
> pending list". The obvious use case is definitely send multishot, but

Not sure I follow, but in all the cases I was contemplating,
you send everything you have at the moment.

> what would the reasoning be to prohibit pacing by explicitly disallowing
> only doing a single buffer (or a partial queue)? As mentioned earlier, I
> like keeping the symmetry with the receive side for multishot, and not
> make it any different unless there's a reason to.

They are different: buffer content is kernel (rx) vs userspace (tx)
provided, and the provided queue / group is per socket vs shared.
Wake ups for multishots as per below. It's not like it's a one line
change, so IMHO it needs to bring some benefits.

>>>> You do make a good point about MSG_WAITALL though - multishot send
>>>> doesn't really make sense to me without MSG_WAITALL semantics. I
>>>> cannot imagine a useful use case where the first buffer being
>>>> partially sent will still want the second buffer sent.
>>>
>>> Right, and I need to tweak that. Maybe we require MSG_WAITALL, or we
>>> make it implied for multishot send. Currently the code doesn't deal with
>>> that.
>>>
>>> Maybe if MSG_WAITALL isn't set and we get a short send we don't set
>>> CQE_F_MORE and we just stop. If it is set, then we go through the usual
>>> retry logic. That would make it identical to MSG_WAITALL send without
>>> multishot, which again is something I like in that we don't have
>>> different behaviors depending on which mode we are using.
>>>
>>>>>> I actually could imagine it being useful for the previous patches' use
>>>>>> case of queuing up sends and keeping ordering,
>>>>>> and I think the API is more obvious (rather than the second CQE
>>>>>> sending the first CQE's data). So maybe it's worth only
>>>>>> keeping one approach?
>>>>>
>>>>> And here you totally lost me :-)
>>>>
>>>> I am suggesting here that you don't really need to support buffer
>>>> lists on send without multishot.
>>>
>>> That is certainly true, but I also don't see a reason _not_ to support
>>> it. Again mostly because this is how receive and everything else works.
>>> The app is free to issue a single SQE for send without multishot, and
>>> pick the first buffer and send it.
>>
>> Multishot sounds interesting, but I don't see it being much use if
>> you terminate when there are no buffers. Otherwise, if it continues
>> to sit there, someone would have to wake it up
> 
> I did think about the termination case, and the problem is that if there
> are no buffers, you need it to wake when there are buffers. And at that
> point you may as well just do another send, as you need the application
> to trigger it. The alternative would be to invent a way to trigger that
> wakeup, which would be send only and weird just because of that.

Yeah, that's the point, wake ups would be userspace driven, and how
to do it without heavy stuff like syscalls is not so clear.

> If you end up poll armed because the socket buffer is full, then it
> certainly makes sense to stay armed as there's a natural trigger there
> when that condition resolves.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 15:41               ` Pavel Begunkov
@ 2024-02-26 19:11                 ` Jens Axboe
  2024-02-26 19:21                   ` Pavel Begunkov
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 19:11 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/26/24 8:41 AM, Pavel Begunkov wrote:
> On 2/26/24 15:16, Jens Axboe wrote:
>> On 2/26/24 7:36 AM, Pavel Begunkov wrote:
>>> On 2/26/24 14:27, Jens Axboe wrote:
>>>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>>>> On Mon, Feb 26, 2024 at 1:38 PM Jens Axboe <[email protected]> wrote:
>>>>>>
>>>>>> On 2/26/24 3:47 AM, Dylan Yudaken wrote:
>>>>>>> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>>>>>>>
>>>>>>>> This works very much like the receive side, except for sends. The idea
>>>>>>>> is that an application can fill outgoing buffers in a provided buffer
>>>>>>>> group, and then arm a single send that will service them all. For now
>>>>>>>> this variant just terminates when we are out of buffers to send, and
>>>>>>>> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
>>>>>>>> set, as per usual for multishot requests.
>>>>>>>>
>>>>>>>
>>>>>>> This feels to me a lot like just using OP_SEND with MSG_WAITALL as
>>>>>>> described, unless I'm missing something?
>>>>>>
>>>>>> How so? MSG_WAITALL is "send X amount of data, and if it's a short send,
>>>>>> try again" where multishot is "send data from this buffer group, and
>>>>>> keep sending data until it's empty". Hence it's the mirror of multishot
>>>>>> on the receive side. Unless I'm misunderstanding you somehow, not sure
>>>>>> it'd be smart to add special meaning to MSG_WAITALL with provided
>>>>>> buffers.
>>>>>>
>>>>>
>>>>> _If_ you have the data upfront these are very similar, and only differ
>>>>> in that the multishot approach will give you more granular progress
>>>>> updates. My point was that this might not be a valuable API to people
>>>>> for only this use case.
>>>>
>>>> Not sure I agree, it feels like attributing a different meaning to
>>>> MSG_WAITALL if you use a provided buffer vs if you don't. And that to me
>>>> would seem to be confusing. Particularly when we have multishot on the
>>>> receive side, and this is identical, just for sends. Receives will keep
>>>> receiving as long as there are buffers in the provided group to receive
>>>> into, and sends will keep sending for the same condition. Either one
>>>> will terminate if we run out of buffers.
>>>>
>>>> If you make MSG_WAITALL be that for provided buffers + send, then that
>>>> behaves differently than MSG_WAITALL with receive, and MSG_WAITALL with
>>>> send _without_ provided buffers. I don't think overloading an existing
>>>> flag for this purpose is a good idea, particularly when we already have
>>>> the existing semantics for multishot on the receive side.
>>>
>>> I'm actually with Dylan on that and wonder where the perf win
>>> could come from. Let's assume TCP, sends are usually completed
>>> in the same syscall, otherwise your pacing is just bad. Thrift,
>>> for example, collects sends and packs them into one multi-iov request
>>> during a loop iteration. If the req completes immediately then
>>> the userspace just wouldn't have time to push more buffers by
>>> definition (assuming single threading).
>>
>> The problem only occurs when they don't complete inline, and now you get
>> reordering. The application could of course attempt to do proper pacing
>> and see if it can avoid that condition. If not, it now needs to
> 
> Ok, I admit that there are more than a few valid cases where artificial
> pacing is not an option, which is why I also laid out the polling case.
> Let's also say that limits the potential perf wins to streaming and very
> large transfers (like files), not "lots of relatively small
> request-response" kinds of apps.

I don't think that's true - if you're doing large streaming, you're more
likely to keep the socket buffer full, whereas for smallish sends, it's
less likely to be full. Testing with the silly proxy confirms that. And
outside of cases where pacing just isn't feasible, it's extra overhead
for cases where you potentially could. To me, the main appeal of
this is the simplicity.

>> serialize sends. Using provided buffers makes this very easy, as you
>> don't need to care about it at all, and it eliminates complexity in the
>> application dealing with this.
> 
> If I'm correct, the example also serialises sends(?). I don't
> think it's that much simpler. You batch, you send. Same with this,
> but you batch into a provided buffer and the send is conditional.

Do you mean the proxy example? Just want to be sure we're talking about
the same thing. Yes it has to serialize sends, because otherwise we can
run into the condition described in the patch that adds provided buffer
support for send. But I did bench multishot separately from there,
here's some of it:

10G network, 3 hosts, 1 acting as a mirror proxy shuffling N-byte packets.
Send ring and send multishot not used:

Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw
=====================================================
1000   |    No	   |  No   |   437  | 1.22M | 9598M
32     |    No	   |  No   |  5856  | 2.87M |  734M

Same test, now turn on send ring:

Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
===========================================================
1000   |    Yes	   |  No   |   436  | 1.23M | 9620M | + 0.2%
32     |    Yes	   |  No   |  3462  | 4.85M | 1237M | +68.5%

Same test, now turn on send mshot as well:

Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
===========================================================
1000   |    Yes	   |  Yes  |   436  | 1.23M | 9620M | + 0.2%
32     |    Yes	   |  Yes  |  3125  | 5.37M | 1374M | +87.2%

which does show that there's another win on top for just queueing these
sends and doing a single send to handle them, rather than needing to
prepare a send for each buffer. Part of that may be that you simply run
out of SQEs and then have to submit regardless of where you are at.

> Another downside is that you need a provided queue per socket,
> which sounds pretty expensive for apps with 100s if not 1000s
> of sockets.

That's certainly true. But either you need a backlog per socket anyway in
the app, or you only send single buffers anyway (in a typical
request/response kind of fashion) between receives and you don't need it
at all.
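
Setup is a one-time cost per socket, along these lines (sketch using
the liburing setup helper; the bgid-per-socket scheme is just
illustrative):

	int ret;
	struct io_uring_buf_ring *br;

	/* one provided buffer ring per socket, keyed by a unique
	 * buffer group ID for that socket */
	br = io_uring_setup_buf_ring(&ring, BUFS, sock_bgid, 0, &ret);
	if (!br) {
		/* handle ret */
	}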

>>> If you actually need to poll tx, you send a request and collect
>>> data into iov in userspace in background. When the request
>>> completes you send all that in batch. You can probably find
>>> a niche example when batch=1 in this case, but I don't think
>>> anyone would care.
>>>
>>> The example doesn't use multi-iov, and also still has to
>>> serialise requests, which naturally serialises buffer consumption
>>> w/o provided bufs.
>>
>> IMHO there's no reason NOT to have both a send with provided buffers and
>> a multishot send. The alternative would be to have send-N, where you
>> pass in N. But I don't see much point to that over "just drain the whole
>> pending list". The obvious use case is definitely send multishot, but
> 
> Not sure I follow, but in all the cases I was contemplating,
> you send everything you have at the moment.
> 
>> what would the reasoning be to prohibit pacing by explicitly disallowing
>> only doing a single buffer (or a partial queue)? As mentioned earlier, I
>> like keeping the symmetry with the receive side for multishot, and not
>> make it any different unless there's a reason to.
> 
> They are different: buffer content is kernel (rx) vs userspace (tx)
> provided, and the provided queue / group is per socket vs shared.
> Wake ups for multishots as per below. It's not like it's a one line
> change, so IMHO it needs to bring some benefits.

Are you talking about provided buffers, or multishot specifically? I
think both are standalone pretty much as simple as they can be. And if
the argument is "just have send with provided buffers be multishot by
default", then that single patch is basically the two patches combined.
There's no simplification there. Outside of a strong argument for why it
would never make sense to do single shot send with provided buffers, I
really don't want to combine them into one single action.

>>>>> You do make a good point about MSG_WAITALL though - multishot send
>>>>> doesn't really make sense to me without MSG_WAITALL semantics. I
>>>>> cannot imagine a useful use case where the first buffer being
>>>>> partially sent will still want the second buffer sent.
>>>>
>>>> Right, and I need to tweak that. Maybe we require MSG_WAITALL, or we
>>>> make it implied for multishot send. Currently the code doesn't deal with
>>>> that.
>>>>
>>>> Maybe if MSG_WAITALL isn't set and we get a short send we don't set
>>>> CQE_F_MORE and we just stop. If it is set, then we go through the usual
>>>> retry logic. That would make it identical to MSG_WAITALL send without
>>>> multishot, which again is something I like in that we don't have
>>>> different behaviors depending on which mode we are using.
>>>>
>>>>>>> I actually could imagine it being useful for the previous patches' use
>>>>>>> case of queuing up sends and keeping ordering,
>>>>>>> and I think the API is more obvious (rather than the second CQE
>>>>>>> sending the first CQE's data). So maybe it's worth only
>>>>>>> keeping one approach?
>>>>>>
>>>>>> And here you totally lost me :-)
>>>>>
>>>>> I am suggesting here that you don't really need to support buffer
>>>>> lists on send without multishot.
>>>>
>>>> That is certainly true, but I also don't see a reason _not_ to support
>>>> it. Again mostly because this is how receive and everything else works.
>>>> The app is free to issue a single SQE for send without multishot, and
>>>> pick the first buffer and send it.
>>>
>>> Multishot sounds interesting, but I don't see it being much use if
>>> you terminate when there are no buffers. Otherwise, if it continues
>>> to sit there, someone would have to wake it up
>>
>> I did think about the termination case, and the problem is that if there
>> are no buffers, you need it to wake when there are buffers. And at that
>> point you may as well just do another send, as you need the application
>> to trigger it. The alternative would be to invent a way to trigger that
>> wakeup, which would be send only and weird just because of that.
> 
> Yeah, that's the point, wake ups would be userspace driven, and how
> to do it without heavy stuff like syscalls is not so clear.

It's just not possible without eg polling, either directly or using some
monitor/mwait arch specific thing which would be awful. Or by doing some
manual wakeup, which would need to lookup and kick the request, which I
bet would be worse than just re-arming the send multishot.

If you could poll trigger it somehow, it also further complicates things
as now it could potentially happen at any time. As it stands, the app
knows when a poll multishot is armed (and submitted, or not), and can
serialize with the outgoing buffer queue trivially.
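
Roughly like this sketch, where queue_buf() and arm_multishot_send()
are made-up names for the app's own helpers, not liburing API:

	/* publish outgoing data, then make sure one (and only one)
	 * multishot send is armed to drain the ring */
	queue_buf(br, data, len);
	if (!send_armed) {
		arm_multishot_send(&ring, sockfd, bgid);
		/* cleared again when a CQE arrives without F_MORE */
		send_armed = true;
	}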

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 19:11                 ` Jens Axboe
@ 2024-02-26 19:21                   ` Pavel Begunkov
  2024-02-26 20:12                     ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Pavel Begunkov @ 2024-02-26 19:21 UTC (permalink / raw)
  To: Jens Axboe, Dylan Yudaken; +Cc: io-uring

On 2/26/24 19:11, Jens Axboe wrote:
> On 2/26/24 8:41 AM, Pavel Begunkov wrote:
>> On 2/26/24 15:16, Jens Axboe wrote:
>>> On 2/26/24 7:36 AM, Pavel Begunkov wrote:
>>>> On 2/26/24 14:27, Jens Axboe wrote:
>>>>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>>>>> On Mon, Feb 26, 2024 at 1:38 PM Jens Axboe <[email protected]> wrote:
>>>>>>>
>>>>>>> On 2/26/24 3:47 AM, Dylan Yudaken wrote:
>>>>>>>> On Sun, Feb 25, 2024 at 12:46 AM Jens Axboe <[email protected]> wrote:
>>>>>>>>>
>>>>>>>>> This works very much like the receive side, except for sends. The idea
>>>>>>>>> is that an application can fill outgoing buffers in a provided buffer
>>>>>>>>> group, and then arm a single send that will service them all. For now
>>>>>>>>> this variant just terminates when we are out of buffers to send, and
>>>>>>>>> hence the application needs to re-arm it if IORING_CQE_F_MORE isn't
>>>>>>>>> set, as per usual for multishot requests.
>>>>>>>>>
>>>>>>>>
>>>>>>>> This feels to me a lot like just using OP_SEND with MSG_WAITALL as
>>>>>>>> described, unless I'm missing something?
>>>>>>>
>>>>>>> How so? MSG_WAITALL is "send X amount of data, and if it's a short send,
>>>>>>> try again" where multishot is "send data from this buffer group, and
>>>>>>> keep sending data until it's empty". Hence it's the mirror of multishot
>>>>>>> on the receive side. Unless I'm misunderstanding you somehow, not sure
>>>>>>> it'd be smart to add special meaning to MSG_WAITALL with provided
>>>>>>> buffers.
>>>>>>>
>>>>>>
>>>>>> _If_ you have the data upfront these are very similar, and only differ
>>>>>> in that the multishot approach will give you more granular progress
>>>>>> updates. My point was that this might not be a valuable API to people
>>>>>> for only this use case.
>>>>>
>>>>> Not sure I agree, it feels like attributing a different meaning to
>>>>> MSG_WAITALL if you use a provided buffer vs if you don't. And that to me
>>>>> would seem to be confusing. Particularly when we have multishot on the
>>>>> receive side, and this is identical, just for sends. Receives will keep
>>>>> receiving as long as there are buffers in the provided group to receive
>>>>> into, and sends will keep sending for the same condition. Either one
>>>>> will terminate if we run out of buffers.
>>>>>
>>>>> If you make MSG_WAITALL be that for provided buffers + send, then that
>>>>> behaves differently than MSG_WAITALL with receive, and MSG_WAITALL with
>>>>> send _without_ provided buffers. I don't think overloading an existing
>>>>> flag for this purpose is a good idea, particularly when we already have
>>>>> the existing semantics for multishot on the receive side.
>>>>
>>>> I'm actually with Dylan on that and wonder where the perf win
>>>> could come from. Let's assume TCP, sends are usually completed
>>>> in the same syscall, otherwise your pacing is just bad. Thrift,
>>>> for example, collects sends and packs them into one multi-iov request
>>>> during a loop iteration. If the req completes immediately then
>>>> the userspace just wouldn't have time to push more buffers by
>>>> definition (assuming single threading).
>>>
>>> The problem only occurs when they don't complete inline, and now you get
>>> reordering. The application could of course attempt to do proper pacing
>>> and see if it can avoid that condition. If not, it now needs to
>>
>> Ok, I admit that there are more than a few valid cases where artificial
>> pacing is not an option, which is why I also laid out the polling case.
>> Let's also say that limits the potential perf wins to streaming and very
>> large transfers (like files), not "lots of relatively small
>> request-response" kinds of apps.
> 
> I don't think that's true - if you're doing large streaming, you're more
> likely to keep the socket buffer full, whereas for smallish sends, it's
> less likely to be full. Testing with the silly proxy confirms that. And

I don't see any contradiction to what I said. With streaming/large
sends it's more likely to be polled. For small sends and
send-receive-send-... patterns the sock queue is unlikely to be
full, in which case the send is processed inline, and so the
feature doesn't add performance, as you agreed a couple of emails
before.

> outside of cases where pacing just isn't feasible, it's extra overhead
> for cases where you potentially could.

I lost it, what overhead?

> To me, the main appeal of this is the simplicity.

I'd argue it doesn't seem any simpler than the alternative.

>>> serialize sends. Using provided buffers makes this very easy, as you
>>> don't need to care about it at all, and it eliminates complexity in the
>>> application dealing with this.
>>
>> If I'm correct, the example also serialises sends(?). I don't
>> think it's that much simpler. You batch, you send. Same with this,
>> but you batch into a provided buffer and the send is conditional.
> 
> Do you mean the proxy example? Just want to be sure we're talking about

Yes, proxy, the one you referenced in the CV. And FWIW, I don't
think it's a fair comparison without batching followed by multi-iov.

> the same thing. Yes it has to serialize sends, because otherwise we can
> run into the condition described in the patch that adds provided buffer
> support for send. But I did bench multishot separately from there,
> here's some of it:
> 
> 10G network, 3 hosts, 1 acting as a mirror proxy shuffling N-byte packets.
> Send ring and send multishot not used:
> 
> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw
> =====================================================
> 1000   |    No	   |  No   |   437  | 1.22M | 9598M
> 32     |    No	   |  No   |  5856  | 2.87M |  734M
> 
> Same test, now turn on send ring:
> 
> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
> ===========================================================
> 1000   |    Yes	   |  No   |   436  | 1.23M | 9620M | + 0.2%
> 32     |    Yes	   |  No   |  3462  | 4.85M | 1237M | +68.5%
> 
> Same test, now turn on send mshot as well:
> 
> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
> ===========================================================
> 1000   |    Yes	   |  Yes  |   436  | 1.23M | 9620M | + 0.2%
> 32     |    Yes	   |  Yes  |  3125  | 5.37M | 1374M | +87.2%
> 
> which does show that there's another win on top for just queueing these
> sends and doing a single send to handle them, rather than needing to
> prepare a send for each buffer. Part of that may be that you simply run
> out of SQEs and then have to submit regardless of where you are at.

How many sockets did you test with? It's 1 SQE per sock, max.

+87% sounds like a huge difference, and I don't understand where
it comes from, hence the question.

>> Another downside is that you need a provided queue per socket,
>> which sounds pretty expensive for apps with 100s if not 1000s
>> of sockets.
> 
> That's certainly true. But either you need a backlog per socket anyway in
> the app, or you only send single buffers anyway (in a typical
> request/response kind of fashion) between receives and you don't need it
> at all.

That's pinning pages and mapping them, which surely is not bad,
but with everything else equal malloc()/stack alloc is much
nicer in terms of resources. (Not talking about CPU setup
overhead).

>>>> If you actually need to poll tx, you send a request and collect
>>>> data into iov in userspace in background. When the request
>>>> completes you send all that in batch. You can probably find
>>>> a niche example when batch=1 in this case, but I don't think
>>>> anyone would care.
>>>>
>>>> The example doesn't use multi-iov, and also still has to
>>>> serialise requests, which naturally serialises buffer consumption
>>>> w/o provided bufs.
>>>
>>> IMHO there's no reason NOT to have both a send with provided buffers and
>>> a multishot send. The alternative would be to have send-N, where you
>>> pass in N. But I don't see much point to that over "just drain the whole
>>> pending list". The obvious use case is definitely send multishot, but
>>
>> Not sure I follow, but in all the cases I was contemplating,
>> you send everything you have at the moment.
>>
>>> what would the reasoning be to prohibit pacing by explicitly disallowing
>>> only doing a single buffer (or a partial queue)? As mentioned earlier, I
>>> like keeping the symmetry with the receive side for multishot, and not
>>> make it any different unless there's a reason to.
>>
>> They are different: buffer content is kernel (rx) vs userspace (tx)
>> provided, and the provided queue / group is per socket vs shared.
>> Wake ups for multishots as per below. It's not like it's a one line
>> change, so IMHO it needs to bring some benefits.
> 
> Are you talking about provided buffers, or multishot specifically? I

I assumed that any of them would retry until the queue is exhausted;
at least that sounds more efficient, and it's what the comments assume.

> think both are standalone pretty much as simple as they can be. And if
> the argument is "just have send with provided buffers be multishot by
> default",

It's not, rx and tx are different, e.g. true tx multishot doesn't
seem to be possible because of that.

> then that single patch is basically the two patches combined.
> There's no simplification there. Outside of a strong argument for why it
> would never make sense to do single shot send with provided buffers, I
> really don't want to combine them into one single action.

In its current form it does make more sense to have
multishot be optional.

>>>>>> You do make a good point about MSG_WAITALL though - multishot send
>>>>>> doesn't really make sense to me without MSG_WAITALL semantics. I
>>>>>> cannot imagine a useful use case where the first buffer being
>>>>>> partially sent will still want the second buffer sent.
>>>>>
>>>>> Right, and I need to tweak that. Maybe we require MSG_WAITALL, or we
>>>>> make it implied for multishot send. Currently the code doesn't deal with
>>>>> that.
>>>>>
>>>>> Maybe if MSG_WAITALL isn't set and we get a short send we don't set
>>>>> CQE_F_MORE and we just stop. If it is set, then we go through the usual
>>>>> retry logic. That would make it identical to MSG_WAITALL send without
>>>>> multishot, which again is something I like in that we don't have
>>>>> different behaviors depending on which mode we are using.
>>>>>
>>>>>>>> I actually could imagine it being useful for the previous patches' use
>>>>>>>> case of queuing up sends and keeping ordering,
>>>>>>>> and I think the API is more obvious (rather than the second CQE
>>>>>>>> sending the first CQE's data). So maybe it's worth only
>>>>>>>> keeping one approach?
>>>>>>>
>>>>>>> And here you totally lost me :-)
>>>>>>
>>>>>> I am suggesting here that you don't really need to support buffer
>>>>>> lists on send without multishot.
>>>>>
>>>>> That is certainly true, but I also don't see a reason _not_ to support
>>>>> it. Again mostly because this is how receive and everything else works.
>>>>> The app is free to issue a single SQE for send without multishot, and
>>>>> pick the first buffer and send it.
>>>>
>>>> Multishot sounds interesting, but I don't see it being much use if
>>>> you terminate when there are no buffers. Otherwise, if it continues
>>>> to sit there, someone would have to wake it up
>>>
>>> I did think about the termination case, and the problem is that if there
>>> are no buffers, you need it to wake when there are buffers. And at that
>>> point you may as well just do another send, as you need the application
>>> to trigger it. The alternative would be to invent a way to trigger that
>>> wakeup, which would be send only and weird just because of that.
>>
>> Yeah, that's the point, wake ups would be userspace driven, and how
>> to do it without heavy stuff like syscalls is not so clear.
> 
> It's just not possible without eg polling, either directly or using some
> monitor/mwait arch specific thing which would be awful. Or by doing some
> manual wakeup, which would need to lookup and kick the request, which I
> bet would be worse than just re-arming the send multishot.

Right

> If you could poll trigger it somehow, it also further complicates things
> as now it could potentially happen at any time. As it stands, the app
> knows when a poll multishot is armed (and submitted, or not), and can
> serialize with the outgoing buffer queue trivially.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 14:27         ` Jens Axboe
  2024-02-26 14:36           ` Pavel Begunkov
@ 2024-02-26 19:31           ` Dylan Yudaken
  2024-02-26 19:49             ` Jens Axboe
  1 sibling, 1 reply; 34+ messages in thread
From: Dylan Yudaken @ 2024-02-26 19:31 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring

On Mon, Feb 26, 2024 at 2:27 PM Jens Axboe <[email protected]> wrote:
>
> > You do make a good point about MSG_WAITALL though - multishot send
> > doesn't really make sense to me without MSG_WAITALL semantics. I
> > cannot imagine a useful use case where the first buffer being
> > partially sent will still want the second buffer sent.
>
> Right, and I need to tweak that. Maybe we require MSG_WAITALL, or we
> make it implied for multishot send. Currently the code doesn't deal with
> that.
>
> Maybe if MSG_WAITALL isn't set and we get a short send we don't set
> CQE_F_MORE and we just stop. If it is set, then we go through the usual
> retry logic. That would make it identical to MSG_WAITALL send without
> multishot, which again is something I like in that we don't have
> different behaviors depending on which mode we are using.
>

It sounds like the right approach and is reasonably obvious. (I see
this is in v4 already)

Dylan

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 19:31           ` Dylan Yudaken
@ 2024-02-26 19:49             ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 19:49 UTC (permalink / raw)
  To: Dylan Yudaken; +Cc: io-uring

On 2/26/24 12:31 PM, Dylan Yudaken wrote:
> On Mon, Feb 26, 2024 at 2:27 PM Jens Axboe <[email protected]> wrote:
>>
>>> You do make a good point about MSG_WAITALL though - multishot send
>>> doesn't really make sense to me without MSG_WAITALL semantics. I
>>> cannot imagine a useful use case where the first buffer being
>>> partially sent will still want the second buffer sent.
>>
>> Right, and I need to tweak that. Maybe we require MSG_WAITALL, or we
>> make it implied for multishot send. Currently the code doesn't deal with
>> that.
>>
>> Maybe if MSG_WAITALL isn't set and we get a short send we don't set
>> CQE_F_MORE and we just stop. If it is set, then we go through the usual
>> retry logic. That would make it identical to MSG_WAITALL send without
>> multishot, which again is something I like in that we don't have
>> different behaviors depending on which mode we are using.
>>
> 
> It sounds like the right approach and is reasonably obvious. (I see
> this is in v4 already)

Yep, thanks for bringing attention to it! I wrote it up in the man pages
as well. At least to me, it's what I would expect to be the case.
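
For reference, the completion handling this implies on the userspace
side is roughly the below (just a sketch; handle_error() and
rearm_send() are placeholder names, not proxy code):

	if (cqe->res < 0) {
		/* terminal error, the multishot send is done */
		handle_error(cqe->res);
	} else if (!(cqe->flags & IORING_CQE_F_MORE)) {
		/*
		 * No more completions coming: we ran out of buffers,
		 * or got a short send without MSG_WAITALL. Re-arm the
		 * send if there's more to do.
		 */
		rearm_send(ring);
	}
	/* with IORING_CQE_F_MORE set, just wait for the next CQE */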

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 19:21                   ` Pavel Begunkov
@ 2024-02-26 20:12                     ` Jens Axboe
  2024-02-26 20:51                       ` Pavel Begunkov
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 20:12 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/26/24 12:21 PM, Pavel Begunkov wrote:
> On 2/26/24 19:11, Jens Axboe wrote:
>> On 2/26/24 8:41 AM, Pavel Begunkov wrote:
>>> On 2/26/24 15:16, Jens Axboe wrote:
>>>> On 2/26/24 7:36 AM, Pavel Begunkov wrote:
>>>>> On 2/26/24 14:27, Jens Axboe wrote:
>>>>>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>>>>>> On Mon, Feb 26, 2024 at 1:38?PM Jens Axboe
>>>>>>> <[email protected]> wrote:
>>>>>>>> 
>>>>>>>> On 2/26/24 3:47 AM, Dylan Yudaken wrote:
>>>>>>>>> On Sun, Feb 25, 2024 at 12:46?AM Jens Axboe
>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>> 
>>>>>>>>>> This works very much like the receive side, except
>>>>>>>>>> for sends. The idea is that an application can fill
>>>>>>>>>> outgoing buffers in a provided buffer group, and
>>>>>>>>>> then arm a single send that will service them all.
>>>>>>>>>> For now this variant just terminates when we are
>>>>>>>>>> out of buffers to send, and hence the application
>>>>>>>>>> needs to re-arm it if IORING_CQE_F_MORE isn't set,
>>>>>>>>>> as per usual for multishot requests.
>>>>>>>>>> 
>>>>>>>>> 
>>>>>>>>> This feels to me a lot like just using OP_SEND with
>>>>>>>>> MSG_WAITALL as described, unless I'm missing
>>>>>>>>> something?
>>>>>>>> 
>>>>>>>> How so? MSG_WAITALL is "send X amount of data, and if
>>>>>>>> it's a short send, try again" where multishot is "send
>>>>>>>> data from this buffer group, and keep sending data
>>>>>>>> until it's empty". Hence it's the mirror of multishot 
>>>>>>>> on the receive side. Unless I'm misunderstanding you
>>>>>>>> somehow, not sure it'd be smart to add special meaning
>>>>>>>> to MSG_WAITALL with provided buffers.
>>>>>>>> 
>>>>>>> 
>>>>>>> _If_ you have the data upfront these are very similar,
>>>>>>> and only differ in that the multishot approach will give
>>>>>>> you more granular progress updates. My point was that
>>>>>>> this might not be a valuable API to people for only this
>>>>>>> use case.
>>>>>> 
>>>>>> Not sure I agree, it feels like attributing a different
>>>>>> meaning to MSG_WAITALL if you use a provided buffer vs if
>>>>>> you don't. And that to me would seem to be confusing.
>>>>>> Particularly when we have multishot on the receive side,
>>>>>> and this is identical, just for sends. Receives will keep 
>>>>>> receiving as long as there are buffers in the provided
>>>>>> group to receive into, and sends will keep sending for the
>>>>>> same condition. Either one will terminate if we run out of
>>>>>> buffers.
>>>>>> 
>>>>>> If you make MSG_WAITALL be that for provided buffers +
>>>>>> send, then that behaves differently than MSG_WAITALL with
>>>>>> receive, and MSG_WAITALL with send _without_ provided
>>>>>> buffers. I don't think overloading an existing flag for
>>>>>> this purposes is a good idea, particularly when we already
>>>>>> have the existing semantics for multishot on the receive
>>>>>> side.
>>>>> 
>>>>> I'm actually with Dylan on that and wonder where the perf
>>>>> win could come from. Let's assume TCP, sends are usually
>>>>> completed in the same syscall, otherwise your pacing is just
>>>>> bad. Thrift, for example, collects sends and packs into one
>>>>> multi iov request during a loop iteration. If the req
>>>>> completes immediately then the userspace just wouldn't have
>>>>> time to push more buffers by definition (assuming single
>>>>> threading).
>>>> 
>>>> The problem only occurs when they don't complete inline, and
>>>> now you get reordering. The application could of course attempt
>>>> to do proper pacing and see if it can avoid that condition. If
>>>> not, it now needs to
>>> 
>>> Ok, I admit that there are more than valid cases when artificial
>>> pacing is not an option, which is why I also laid out the polling
>>> case. Let's also say that limits potential perf wins to streaming
>>> and very large transfers (like files), not "lots of relatively
>>> small request-response" kinds of apps.
>> 
>> I don't think that's true - if you're doing large streaming, you're
>> more likely to keep the socket buffer full, whereas for smallish
>> sends, it's less likely to be full. Testing with the silly proxy
>> confirms that. And
> 
> I don't see any contradiction to what I said. With streaming/large 
> sends it's more likely to be polled. For small sends and 
> send-receive-send-... patterns the sock queue is unlikely to be full,
> in which case the send is processed inline, and so the feature
> doesn't add performance, as you agreed a couple email before.

Gotcha, I guess I misread you, we agree that the poll side is more
likely on bigger buffers.

>> outside of cases where pacing just isn't feasible, it's extra
>> overhead for cases where you potentially could pace.
> 
> I lost it, what overhead?

Overhead of needing to serialize the sends in the application, which may
include both extra memory needed and overhead in dealing with it.

>> To me, the main appeal of this is the simplicity.
> 
> I'd argue it doesn't seem any simpler than the alternative.

It's certainly simpler for an application to do "add buffer to queue"
and not need to worry about managing sends, than it is to manage a
backlog of only having a single send active.
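
Especially as the "add buffer to queue" part is just the normal buffer
ring operations. With liburing, and assuming 'br' is the buffer ring
registered for this socket's send group and RING_ENTRIES its size,
that's:

	/* hand 'data' of 'len' bytes to the kernel as buffer ID 'bid' */
	io_uring_buf_ring_add(br, data, len, bid,
			      io_uring_buf_ring_mask(RING_ENTRIES), 0);
	/* make the new buffer visible to the kernel */
	io_uring_buf_ring_advance(br, 1);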

>>>> serialize sends. Using provided buffers makes this very easy,
>>>> as you don't need to care about it at all, and it eliminates
>>>> complexity in the application dealing with this.
>>> 
>>> If I'm correct the example also serialises sends(?). I don't 
>>> think it's that simpler. You batch, you send. Same with this, but
>>> batch into a provided buffer and the send is conditional.
>> 
>> Do you mean the proxy example? Just want to be sure we're talking
>> about
> 
> Yes, proxy, the one you referenced in the CV. And FWIW, I don't think
> it's a fair comparison without batching followed by multi-iov.

It's not about vectored vs non-vectored IO, though you would of course
need to allocate an arbitrarily sized iovec that you can append to. And
now you need to use sendmsg rather than just send, which has further
overhead on top of send.

What kind of batching? The batching done by the tests is the same,
regardless of whether or not send multishot is used in the sense that we
wait on the same number of completions. As it's a basic proxy kind of
thing, it'll receive a packet and send a packet. Submission batching is
the same too, we'll submit when we have to.

>> the same thing. Yes it has to serialize sends, because otherwise we
>> can run into the condition described in the patch that adds
>> provided buffer support for send. But I did bench multishot
>> separately from there, here's some of it:
>> 
>> 10G network, 3 hosts, 1 acting as a mirror proxy shuffling N-byte
>> packets. Send ring and send multishot not used:
>> 
>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw
>> =====================================================
>> 1000   |    No     |  No   |   437  | 1.22M | 9598M
>> 32     |    No     |  No   |  5856  | 2.87M |  734M
>> 
>> Same test, now turn on send ring:
>> 
>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
>> ===========================================================
>> 1000   |    Yes    |  No   |   436  | 1.23M | 9620M | + 0.2%
>> 32     |    Yes    |  No   |  3462  | 4.85M | 1237M | +68.5%
>> 
>> Same test, now turn on send mshot as well:
>> 
>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
>> ===========================================================
>> 1000   |    Yes    |  Yes  |   436  | 1.23M | 9620M | + 0.2%
>> 32     |    Yes    |  Yes  |  3125  | 5.37M | 1374M | +87.2%
>> 
>> which does show that there's another win on top for just queueing
>> these sends and doing a single send to handle them, rather than
>> needing to prepare a send for each buffer. Part of that may be that
>> you simply run out of SQEs and then have to submit regardless of
>> where you are at.
> 
> How many sockets did you test with? It's 1 SQE per sock max

The above is just one, but I've run it with a lot more sockets. Nothing
like thousands, but 64-128.

> +87% sounds like a huge difference, and I don't understand where it
> comes from, hence the question

There are several things:

1) Fact is that the app has to serialize sends for the unlikely case
   of sends being reordered because of the condition outlined in the
   patch that enables provided buffer support for send. This is the
   largest win, particularly with smaller packets, as it ruins the
   send pipeline.

2) We're posting fewer SQEs. That's the multishot win. Obviously not
   as large, but it does help.

People have asked in the past on how to serialize sends, and I've had to
tell them that it isn't really possible. The only option we had was
using drain or links, which aren't ideal nor very flexible. Using
provided buffers finally gives the application a way to do that without
needing to do anything really. Does every application need it? Certainly
not, but for the ones that do, I do think it provides a great
alternative that's better performing than doing one send at a
time.
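
And the arming side is minimal too. A sketch, with SEND_BGID assumed to
be the group ID of the buffer ring the app fills (multishot would
additionally set the flag added in this series):

	sqe = io_uring_get_sqe(ring);
	/* no address or length, the kernel picks from the group */
	io_uring_prep_send(sqe, sockfd, NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = SEND_BGID;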

>>> Another downside is that you need a provided queue per socket, 
>>> which sounds pretty expensive for 100s if not 1000s socket apps.
>> 
>> That's certainly true. But either you need backlog per socket
>> anyway in the app, or you only send single buffers anyway (in a
>> typical request/response kind of fashion) between receives and you
>> don't need it at all.
> 
> That's pinning pages and mapping them, which surely is not bad but
> with everything else equal malloc()/stack alloc is much nicer in
> terms of resources. (Not talking about CPU setup overhead).

Sure, it's not free in terms of memory either. As mentioned several
times, the main win is on efficiency and in reducing complexity, and
both of those are pretty nice imho.

>>>>> If you actually need to poll tx, you send a request and
>>>>> collect data into iov in userspace in background. When the
>>>>> request completes you send all that in batch. You can
>>>>> probably find a niche example when batch=1 in this case, but
>>>>> I don't think anyone would care.
>>>>> 
>>>>> The example doesn't use multi-iov, and also still has to 
>>>>> serialise requests, which naturally serialises buffer
>>>>> consumption w/o provided bufs.
>>>> 
>>>> IMHO there's no reason NOT to have both a send with provided
>>>> buffers and a multishot send. The alternative would be to have
>>>> send-N, where you pass in N. But I don't see much point to that
>>>> over "just drain the whole pending list". The obvious use case
>>>> is definitely send multishot, but
>>> 
>>> Not sure I follow, but in all the cases I was contemplating, you
>>> send everything you have at the moment.
>>> 
>>>> what would the reasoning be to prohibit pacing by explicitly
>>>> disallowing only doing a single buffer (or a partial queue)? As
>>>> mentioned earlier, I like keeping the symmetry with the receive
>>>> side for multishot, and not make it any different unless
>>>> there's a reason to.
>>> 
>>> They are different: buffer content is kernel (rx) vs userspace
>>> (tx) provided, and the provided queue / group is per socket vs
>>> shared. Wake ups for multishots as per below. It's not like it's a
>>> one line change, so IMHO it needs to bring some benefits.
>> 
>> Are you talking about provided buffers, or multishot specifically?
>> I
> 
> I assumed that any of them would retry until the queue is exhausted, 
> at least that sounds more efficient and used in all comments.

That is what it does, it'll keep sending until it runs out of buffers
(or hits an error, short send, whatever).

>> think both are standalone pretty much as simple as they can be. And
>> if the argument is "just have send with provided buffers be
>> multishot by default",
> 
> It's not, rx and tx are different, e.g. true tx multishot doesn't 
> seem to be possible because of that.

In the sense that rx and poll trigger on data now being available isn't
feasible on send, yeah they are not exact mirrors of each other. But
they are as close as they can be. If there was, or ever will be, an
efficient way to re-trigger a multishot send, that would certainly be
a doable and easy addition to make on top of this. It really only
changes the termination point, if you run out of buffers you just go to
whatever arming method would be suitable for that. But since the reason
for recv multishot is to avoid hammering on the locking on the poll
side, I'm not convinced that having a perpetual multishot send would
make a lot more sense than simply doing another one when needed. If
you're socket buffer bound on multishot send, then the perpetual poll
trigger works and is useful.

>> then that single patch is basically the two patches combined. 
>> There's no simplification there. Outside of a strong argument for
>> why it would never make sense to do single shot send with provided
>> buffers, I really don't want to combine them into one single
>> action.
> 
> In the current form it does make more sense to have multishot
> optionally.

I obviously agree on that too, kept them separate in the v4 posting as
well.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 20:12                     ` Jens Axboe
@ 2024-02-26 20:51                       ` Pavel Begunkov
  2024-02-26 21:27                         ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Pavel Begunkov @ 2024-02-26 20:51 UTC (permalink / raw)
  To: Jens Axboe, Dylan Yudaken; +Cc: io-uring

On 2/26/24 20:12, Jens Axboe wrote:
> On 2/26/24 12:21 PM, Pavel Begunkov wrote:
>> On 2/26/24 19:11, Jens Axboe wrote:
>>> On 2/26/24 8:41 AM, Pavel Begunkov wrote:
>>>> On 2/26/24 15:16, Jens Axboe wrote:
>>>>> On 2/26/24 7:36 AM, Pavel Begunkov wrote:
>>>>>> On 2/26/24 14:27, Jens Axboe wrote:
>>>>>>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>>>>>>> On Mon, Feb 26, 2024 at 1:38?PM Jens Axboe
...
>>> I don't think that's true - if you're doing large streaming, you're
>>> more likely to keep the socket buffer full, whereas for smallish
>>> sends, it's less likely to be full. Testing with the silly proxy
>>> confirms that. And
>>
>> I don't see any contradiction to what I said. With streaming/large
>> sends it's more likely to be polled. For small sends and
>> send-receive-send-... patterns the sock queue is unlikely to be full,
>> in which case the send is processed inline, and so the feature
>> doesn't add performance, as you agreed a couple email before.
> 
> Gotcha, I guess I misread you, we agree that the poll side is more
> likely on bigger buffers.
> 
>>> outside of cases where pacing just isn't feasible, it's extra
>>> overhead for cases where you potentially could or what.
>>
>> I lost it, what overhead?
> 
> Overhead of needing to serialize the sends in the application, which may
> include both extra memory needed and overhead in dealing with it.

I think I misread the code. Does it push 1 request for each
send buffer / queue_send() in case of provided bufs?

Anyway, the overhead of serialisation would be negligible.
And that's same extra memory you keep for the provided buffer
pool, and you can allocate it once. Also consider that provided
buffers are fixed size and it'd be hard to resize without waiting,
thus the userspace would still need to have another, userspace
backlog, it can't just drop requests. Or you make provided queues
extra large, but it's per socket and you'd be wasting lots of memory.

IOW, I don't think this overhead comes anywhere close to
explaining the 30%+ perf gap.
  
>>> To me, the main appeal of this is the simplicity.
>>
>> I'd argue it doesn't seem any simpler than the alternative.
> 
> It's certainly simpler for an application to do "add buffer to queue"
> and not need to worry about managing sends, than it is to manage a
> backlog of only having a single send active.

They still need to manage / re-queue sends. And maybe I
misunderstand the point, but it's only one request inflight
per socket in either case.
  

>>>>> serialize sends. Using provided buffers makes this very easy,
>>>>> as you don't need to care about it at all, and it eliminates
>>>>> complexity in the application dealing with this.
>>>>
>>>> If I'm correct the example also serialises sends(?). I don't
>>>> think it's that simpler. You batch, you send. Same with this, but
>>>> batch into a provided buffer and the send is conditional.
>>>
>>> Do you mean the proxy example? Just want to be sure we're talking
>>> about
>>
>> Yes, proxy, the one you referenced in the CV. And FWIW, I don't think
>> it's a fair comparison without batching followed by multi-iov.
> 
> It's not about vectored vs non-vectored IO, though you would of course
> need to allocate an arbitrarily sized iovec that you can append to. And
> now you need to use sendmsg rather than just send, which has further
> overhead on top of send.

That's not nearly enough overhead to explain the difference,
I don't believe so, going through the net stack is quite expensive.

> What kind of batching? The batching done by the tests are the same,
> regardless of whether or not send multishot is used in the sense that we

You can say that, but I say that it moves into the kernel
batching that can be implemented in userspace.

> wait on the same number of completions. As it's a basic proxy kind of
> thing, it'll receive a packet and send a packet. Submission batching is
> the same too, we'll submit when we have to.

"If you actually need to poll tx, you send a request and collect
data into iov in userspace in background. When the request
completes you send all that in batch..."

That's how it's in Thrift for example.

In terms of "proxy", the first approximation would be to
do sth like defer_send() for normal requests as well, then

static void __queue_send(struct io_uring *ring, struct conn *c, int fd,
			 void *data, int bid, int len)
{
	...

	defer_send(data);

	while (buf = defer_backlog.get()) {
		iov[idx++] = buf;
	}
	msghdr->iovlen = idx;
	...
}

>>> the same thing. Yes it has to serialize sends, because otherwise we
>>> can run into the condition described in the patch that adds
>>> provided buffer support for send. But I did bench multishot
>>> separately from there, here's some of it:
>>>
>>> 10G network, 3 hosts, 1 acting as a mirror proxy shuffling N-byte
>>> packets. Send ring and send multishot not used:
>>>
>>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw
>>> =====================================================
>>> 1000   |    No     |  No   |   437  | 1.22M | 9598M
>>> 32     |    No     |  No   |  5856  | 2.87M |  734M
>>>
>>> Same test, now turn on send ring:
>>>
>>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
>>> ===========================================================
>>> 1000   |    Yes    |  No   |   436  | 1.23M | 9620M | + 0.2%
>>> 32     |    Yes    |  No   |  3462  | 4.85M | 1237M | +68.5%
>>>
>>> Same test, now turn on send mshot as well:
>>>
>>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
>>> ===========================================================
>>> 1000   |    Yes    |  Yes  |   436  | 1.23M | 9620M | + 0.2%
>>> 32     |    Yes    |  Yes  |  3125  | 5.37M | 1374M | +87.2%
>>>
>>> which does show that there's another win on top for just queueing
>>> these sends and doing a single send to handle them, rather than
>>> needing to prepare a send for each buffer. Part of that may be that
>>> you simply run out of SQEs and then have to submit regardless of
>>> where you are at.
>>
>> How many sockets did you test with? It's 1 SQE per sock max
> 
> The above is just one, but I've run it with a lot more sockets. Nothing
> like thousands, but 64-128.
> 
>> +87% sounds like a huge difference, and I don't understand where it
>> comes from, hence the question
> 
> There are several things:
> 
> 1) Fact is that the app has to serialize sends for the unlikely case
>     of sends being reordered because of the condition outlined in the
>     patch that enables provided buffer support for send. This is the
>     largest win, particularly with smaller packets, as it ruins the
>     send pipeline.

Do those small packets force it to poll?

> 2) We're posting fewer SQEs. That's the multishot win. Obviously not
>     as large, but it does help.
> 
> People have asked in the past on how to serialize sends, and I've had to
> tell them that it isn't really possible. The only option we had was
> using drain or links, which aren't ideal nor very flexible. Using
> provided buffers finally gives the application a way to do that without
> needing to do anything really. Does every application need it? Certainly
> not, but for the ones that do, I do think it provides a great
> alternative that's better performing than doing one send at a
> time.

As per note on additional userspace backlog, any real generic app
and especially libs would need to do more to support it.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 20:51                       ` Pavel Begunkov
@ 2024-02-26 21:27                         ` Jens Axboe
  2024-02-28 12:39                           ` Pavel Begunkov
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-26 21:27 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/26/24 1:51 PM, Pavel Begunkov wrote:
> On 2/26/24 20:12, Jens Axboe wrote:
>> On 2/26/24 12:21 PM, Pavel Begunkov wrote:
>>> On 2/26/24 19:11, Jens Axboe wrote:
>>>> On 2/26/24 8:41 AM, Pavel Begunkov wrote:
>>>>> On 2/26/24 15:16, Jens Axboe wrote:
>>>>>> On 2/26/24 7:36 AM, Pavel Begunkov wrote:
>>>>>>> On 2/26/24 14:27, Jens Axboe wrote:
>>>>>>>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>>>>>>>> On Mon, Feb 26, 2024 at 1:38?PM Jens Axboe
> ...
>>>> I don't think that's true - if you're doing large streaming, you're
>>>> more likely to keep the socket buffer full, whereas for smallish
>>>> sends, it's less likely to be full. Testing with the silly proxy
>>>> confirms that. And
>>>
>>> I don't see any contradiction to what I said. With streaming/large
>>> sends it's more likely to be polled. For small sends and
>>> send-receive-send-... patterns the sock queue is unlikely to be full,
>>> in which case the send is processed inline, and so the feature
>>> doesn't add performance, as you agreed a couple email before.
>>
>> Gotcha, I guess I misread you, we agree that the poll side is more
>> likely on bigger buffers.
>>
>>>>> outside of cases where pacing just isn't feasible, it's extra
>>>>> overhead for cases where you potentially could pace.
>>>
>>> I lost it, what overhead?
>>
>> Overhead of needing to serialize the sends in the application, which may
>> include both extra memory needed and overhead in dealing with it.
> 
> I think I misread the code. Does it push 1 request for each
> send buffer / queue_send() in case of provided bufs?

Right, that's the way it's currently setup. Per send (per loop), if
you're using provided buffers, it'll do a send per buffer. If using
multishot on top of that, it'll do one send per loop regardless of the
number of buffers.

> Anyway, the overhead of serialisation would be negligible.
> And that's same extra memory you keep for the provided buffer
> pool, and you can allocate it once. Also consider that provided
> buffers are fixed size and it'd be hard to resize without waiting,
> thus the userspace would still need to have another, userspace
> backlog, it can't just drop requests. Or you make provided queues
> extra large, but it's per socket and you'd be wasting lots of memory.
> 
> IOW, I don't think this overhead comes anywhere close to
> explaining the 30%+ perf gap.

The 32-byte case is obviously somewhat pathological, as you're going to
be much better off having a bunch of these pipelined rather than issued
serially. As you can see from the 1000 byte packets, at that point it
doesn't matter that much. It's mostly about making it simpler at that
point.

>>>> To me, the main appeal of this is the simplicity.
>>>
>>> I'd argue it doesn't seem any simpler than the alternative.
>>
>> It's certainly simpler for an application to do "add buffer to queue"
>> and not need to worry about managing sends, than it is to manage a
>> backlog of only having a single send active.
> 
> They still need to manage / re-queue sends. And maybe I
> misunderstand the point, but it's only one request inflight
> per socket in either case.

Sure, but one is a manageable condition, the other one is not. If you
can keep N inflight at the same time and only abort the chain in case of
error/short send, that's a corner case. Versus not knowing when things
get reordered, and hence always needing to serialize.

>>>>>> serialize sends. Using provided buffers makes this very easy,
>>>>>> as you don't need to care about it at all, and it eliminates
>>>>>> complexity in the application dealing with this.
>>>>>
>>>>> If I'm correct the example also serialises sends(?). I don't
>>>>> think it's that simpler. You batch, you send. Same with this, but
>>>>> batch into a provided buffer and the send is conditional.
>>>>
>>>> Do you mean the proxy example? Just want to be sure we're talking
>>>> about
>>>
>>> Yes, proxy, the one you referenced in the CV. And FWIW, I don't think
>>> it's a fair comparison without batching followed by multi-iov.
>>
>> It's not about vectored vs non-vectored IO, though you would of course
>> need to allocate an arbitrarily sized iovec that you can append to. And
>> now you need to use sendmsg rather than just send, which has further
>> overhead on top of send.
> 
> That's not nearly enough overhead to explain the difference,
> I don't believe so, going through the net stack is quite expensive.

See above, for the 32-byte packets, it's not hard to imagine big wins by
having many shoved in vs doing them piecemeal.

And honestly, I was surprised at how well the stack deals with this on
the networking side! It may have room for improvement, but it's not
nearly as sluggish as I feared.

>> What kind of batching? The batching done by the tests is the same,
>> regardless of whether or not send multishot is used in the sense that we
> 
> You can say that, but I say that it moves into the kernel
> batching that can be implemented in userspace.

And then most people get it wrong or just do the basic stuff, and
performance isn't very good. Getting the most out of it can be tricky
and require extensive testing and knowledge building. I'm confident
you'd be able to write an efficient version, but that's not the same as
saying "it's trivial to write an efficient version".

>> wait on the same number of completions. As it's a basic proxy kind of
>> thing, it'll receive a packet and send a packet. Submission batching is
>> the same too, we'll submit when we have to.
> 
> "If you actually need to poll tx, you send a request and collect
> data into iov in userspace in background. When the request
> completes you send all that in batch..."
> 
> That's how it's in Thrift for example.
> 
> In terms of "proxy", the first approximation would be to
> do sth like defer_send() for normal requests as well, then
> 
> static void __queue_send(struct io_uring *ring, struct conn *c, int fd,
>              void *data, int bid, int len)
> {
>     ...
> 
>     defer_send(data);
> 
>     while (buf = defer_backlog.get()) {
>         iov[idx++] = buf;
>     }
>     msghdr->iovlen = idx;
>     ...
> }

Yep, that's the iovec coalescing, and that could certainly be done. And
then you need to size the iov[] so that it's always big enough, OR
submit that send and still deal with managing your own backlog.

I don't think we disagree that there are other solutions. I'm saying
that I like this solution. I think it's simple to use for the cases that
can use it, and that's why the patches exist. It fits with the notion of
an async API being able to keep multiple things in flight, rather than a
semi solution where you kind of can, except not for cases X and Y
because of corner cases.
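
FWIW, the sizing part is easy enough, it's just one more thing to
manage. Something like this hypothetical append helper would do (not
code from the proxy):

	static int iov_append(struct iovec **iov, int *nr, int *cap,
			      void *base, size_t len)
	{
		if (*nr == *cap) {
			int new_cap = *cap ? *cap * 2 : 16;
			struct iovec *tmp;

			tmp = realloc(*iov, new_cap * sizeof(*tmp));
			if (!tmp)
				return -ENOMEM;
			*iov = tmp;
			*cap = new_cap;
		}
		(*iov)[*nr].iov_base = base;
		(*iov)[*nr].iov_len = len;
		(*nr)++;
		return 0;
	}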

>>>> the same thing. Yes it has to serialize sends, because otherwise we
>>>> can run into the condition described in the patch that adds
>>>> provided buffer support for send. But I did bench multishot
>>>> separately from there, here's some of it:
>>>>
>>>> 10G network, 3 hosts, 1 acting as a mirror proxy shuffling N-byte
>>>> packets. Send ring and send multishot not used:
>>>>
>>>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw
>>>> =====================================================
>>>> 1000   |    No     |  No   |   437  | 1.22M | 9598M
>>>> 32     |    No     |  No   |  5856  | 2.87M |  734M
>>>>
>>>> Same test, now turn on send ring:
>>>>
>>>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
>>>> ===========================================================
>>>> 1000   |    Yes    |  No   |   436  | 1.23M | 9620M | + 0.2%
>>>> 32     |    Yes    |  No   |  3462  | 4.85M | 1237M | +68.5%
>>>>
>>>> Same test, now turn on send mshot as well:
>>>>
>>>> Pkt sz | Send ring | mshot |  usec  |  QPS  |  Bw   | Diff
>>>> ===========================================================
>>>> 1000   |    Yes    |  Yes  |   436  | 1.23M | 9620M | + 0.2%
>>>> 32     |    Yes    |  Yes  |  3125  | 5.37M | 1374M | +87.2%
>>>>
>>>> which does show that there's another win on top for just queueing
>>>> these sends and doing a single send to handle them, rather than
>>>> needing to prepare a send for each buffer. Part of that may be that
>>>> you simply run out of SQEs and then have to submit regardless of
>>>> where you are at.
>>>
>>> How many sockets did you test with? It's 1 SQE per sock max
>>
>> The above is just one, but I've run it with a lot more sockets. Nothing
>> like thousands, but 64-128.
>>
>>> +87% sounds like a huge difference, and I don't understand where it
>>> comes from, hence the question
>>
>> There are several things:
>>
>> 1) Fact is that the app has to serialize sends for the unlikely case
>>     of sends being reordered because of the condition outlined in the
>>     patch that enables provided buffer support for send. This is the
>>     largest win, particularly with smaller packets, as it ruins the
>>     send pipeline.
> 
> Do those small packets force it to poll?

There's no polling in my testing.

>> 2) We're posting fewer SQEs. That's the multishot win. Obviously not
>>     as large, but it does help.
>>
>> People have asked in the past on how to serialize sends, and I've had to
>> tell them that it isn't really possible. The only option we had was
>> using drain or links, which aren't ideal nor very flexible. Using
>> provided buffers finally gives the application a way to do that without
>> needing to do anything really. Does every application need it? Certainly
>> not, but for the ones that do, I do think it provides a great
>> alternative that's better performing than doing one send at a
>> time.
> 
> As per note on additional userspace backlog, any real generic app
> and especially libs would need to do more to support it.

Sure, if you get a short send or any abort in the chain, you need to
handle it. But things stall/stop at that point and you handle it, and
then you're back up and running. This is vastly different from "oh I
always need to serialize because X or Y may happen, even though it
rarely does or never does in my case".

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-26 21:27                         ` Jens Axboe
@ 2024-02-28 12:39                           ` Pavel Begunkov
  2024-02-28 17:28                             ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Pavel Begunkov @ 2024-02-28 12:39 UTC (permalink / raw)
  To: Jens Axboe, Dylan Yudaken; +Cc: io-uring

On 2/26/24 21:27, Jens Axboe wrote:
> On 2/26/24 1:51 PM, Pavel Begunkov wrote:
>> On 2/26/24 20:12, Jens Axboe wrote:
>>> On 2/26/24 12:21 PM, Pavel Begunkov wrote:
>>>> On 2/26/24 19:11, Jens Axboe wrote:
>>>>> On 2/26/24 8:41 AM, Pavel Begunkov wrote:
>>>>>> On 2/26/24 15:16, Jens Axboe wrote:
>>>>>>> On 2/26/24 7:36 AM, Pavel Begunkov wrote:
>>>>>>>> On 2/26/24 14:27, Jens Axboe wrote:
>>>>>>>>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>>>>>>>>> On Mon, Feb 26, 2024 at 1:38?PM Jens Axboe
>> ...
>>>>> I don't think that's true - if you're doing large streaming, you're
>>>>> more likely to keep the socket buffer full, whereas for smallish
>>>>> sends, it's less likely to be full. Testing with the silly proxy
>>>>> confirms that. And
>>>>
>>>> I don't see any contradiction to what I said. With streaming/large
>>>> sends it's more likely to be polled. For small sends and
>>>> send-receive-send-... patterns the sock queue is unlikely to be full,
>>>> in which case the send is processed inline, and so the feature
>>>> doesn't add performance, as you agreed a couple email before.
>>>
>>> Gotcha, I guess I misread you, we agree that the poll side is more
>>> likely on bigger buffers.
>>>
>>>>> outside of cases where pacing just isn't feasible, it's extra
>>>>> overhead for cases where you potentially could pace.
>>>>
>>>> I lost it, what overhead?
>>>
>>> Overhead of needing to serialize the sends in the application, which may
>>> include both extra memory needed and overhead in dealing with it.
>>
>> I think I misread the code. Does it push 1 request for each
>> send buffer / queue_send() in case of provided bufs?
> 
> Right, that's the way it's currently setup. Per send (per loop), if
> you're using provided buffers, it'll do a send per buffer. If using
> multishot on top of that, it'll do one send per loop regardless of the
> number of buffers.

Ok, then I'd say the performance tests are misleading, at least
the proxy one. For selected buffers, sending tons of requests sounds
unnecessarily expensive, but I don't have numbers to back it. The
normal case must employ the natural batching it has with
serialisation.

It's important comparing programs that are optimised and close
to what is achievable in userspace. You wouldn't compare it with

for (i = 0; i < nr_bytes; i++)
	send(fd, data + i, 1, 0);

and claim that it's N times faster. Or surely we can add an
empty "for" loop to one side and say "users are stupid and
will put garbage here anyway, so fair game", but then it
needs a note in small font "valid only for users who can't
write code up to standard"

static void defer_send() {
	ps = malloc(sizeof(*ps));
}

fwiw, maybe malloc is not really expensive there, but
that sounds like a pretty bad practice for a hi perf
benchmark.

>> Anyway, the overhead of serialisation would be negligible.
>> And that's same extra memory you keep for the provided buffer
>> pool, and you can allocate it once. Also consider that provided
>> buffers are fixed size and it'd be hard to resize without waiting,
>> thus the userspace would still need to have another, userspace
>> backlog, it can't just drop requests. Or you make provided queues
>> extra large, but it's per socket and you'd be wasting lots of memory.
>>
>> IOW, I don't think this overhead comes anywhere close to
>> explaining the 30%+ perf gap.
> 
> The 32-byte case is obviously somewhat pathological, as you're going to
> be much better off having a bunch of these pipelined rather than issued
> serially. As you can see from the 1000 byte packets, at that point it
> doesn't matter that much. It's mostly about making it simpler at that
> point.
> 
>>>>> To me, the main appeal of this is the simplicity.
>>>>
>>>> I'd argue it doesn't seem any simpler than the alternative.
>>>
>>> It's certainly simpler for an application to do "add buffer to queue"
>>> and not need to worry about managing sends, than it is to manage a
>>> backlog of only having a single send active.
>>
>> They still need to manage / re-queue sends. And maybe I
>> misunderstand the point, but it's only one request inflight
>> per socket in either case.
> 
> Sure, but one is a manageable condition, the other one is not. If you
> can keep N inflight at the same time and only abort the chain in case of
> error/short send, that's a corner case. Versus not knowing when things
> get reordered, and hence always needing to serialize.

Manageable or not, you still have to implement all that, whereas
serialisation is not complex and I doubt it's anywhere expensive
enough to overturn the picture. It seems that multishot selected
bufs would also need serialisation, and for oneshots managing
multiple requests when you don't know which one sends what buffer
would be complicated in real scenarios.

>>>> What kind of batching? The batching done by the tests is the same,
>>> regardless of whether or not send multishot is used in the sense that we
>>
>> You can say that, but I say that it moves into the kernel
>> batching that can be implemented in userspace.
> 
> And then most people get it wrong or just do the basic stuff, and
> performance isn't very good. Getting the most out of it can be tricky
> and require extensive testing and knowledge building. I'm confident
> you'd be able to write an efficient version, but that's not the same as
> saying "it's trivial to write an efficient version".

It actually _is_ trivial to write an efficient version for anyone
competent enough and having a spare day for that, which is usually
in the form of a library

I'm a firm believer of not putting into the kernel what can
already be well implemented in userspace, because the next step
could be "firefox can be implemented in userspace, but it requires
knowledge building, so let's implement it in the kernel". At
least I recall nginx / HTTP servers not flying well

>>> wait on the same number of completions. As it's a basic proxy kind of
>>> thing, it'll receive a packet and send a packet. Submission batching is
>>> the same too, we'll submit when we have to.
>>
>> "If you actually need to poll tx, you send a request and collect
>> data into iov in userspace in background. When the request
>> completes you send all that in batch..."
>>
>> That's how it's in Thrift for example.
>>
>> In terms of "proxy", the first approximation would be to
>> do sth like defer_send() for normal requests as well, then
>>
>> static void __queue_send(struct io_uring *ring, struct conn *c, int fd,
>>               void *data, int bid, int len)
>> {
>>      ...
>>
>>      defer_send(data);
>>
>>      while (buf = defer_backlog.get()) {
>>          iov[idx++] = buf;
>>      }
>>      msghdr->iovlen = idx;
>>      ...
>> }
> 
> Yep, that's the iovec coalescing, and that could certainly be done. And
> then you need to size the iov[] so that it's always big enough, OR
> submit that send and still deal with managing your own backlog.

Which is as trivial as iov.push_back() or a couple of lines with
realloc, whereas for selected buffers you would likely need to
wait for the previous request[s] to complete, which you cannot
do in place.

The point is, it seems that when you'd try to write something
error proof and real life ready, it wouldn't look simpler or
much simpler than the approach suggested, then the performance
is the question.

> I don't think we disagree that there are other solutions. I'm saying
> that I like this solution. I think it's simple to use for the cases that
> can use it, and that's why the patches exist. It fits with the notion of
> an async API being able to keep multiple things in flight, rather than a
> semi solution where you kind of can, except not for cases X and Y
> because of corner cases.
...

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-28 12:39                           ` Pavel Begunkov
@ 2024-02-28 17:28                             ` Jens Axboe
  2024-02-28 23:49                               ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-28 17:28 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/28/24 5:39 AM, Pavel Begunkov wrote:
> On 2/26/24 21:27, Jens Axboe wrote:
>> On 2/26/24 1:51 PM, Pavel Begunkov wrote:
>>> On 2/26/24 20:12, Jens Axboe wrote:
>>>> On 2/26/24 12:21 PM, Pavel Begunkov wrote:
>>>>> On 2/26/24 19:11, Jens Axboe wrote:
>>>>>> On 2/26/24 8:41 AM, Pavel Begunkov wrote:
>>>>>>> On 2/26/24 15:16, Jens Axboe wrote:
>>>>>>>> On 2/26/24 7:36 AM, Pavel Begunkov wrote:
>>>>>>>>> On 2/26/24 14:27, Jens Axboe wrote:
>>>>>>>>>> On 2/26/24 7:02 AM, Dylan Yudaken wrote:
>>>>>>>>>>> On Mon, Feb 26, 2024 at 1:38?PM Jens Axboe
>>> ...
>>>>>> I don't think that's true - if you're doing large streaming, you're
>>>>>> more likely to keep the socket buffer full, whereas for smallish
>>>>>> sends, it's less likely to be full. Testing with the silly proxy
>>>>>> confirms that. And
>>>>>
>>>>> I don't see any contradiction to what I said. With streaming/large
>>>>> sends it's more likely to be polled. For small sends and
>>>>> send-receive-send-... patterns the sock queue is unlikely to be full,
>>>>> in which case the send is processed inline, and so the feature
>>>>> doesn't add performance, as you agreed a couple email before.
>>>>
>>>> Gotcha, I guess I misread you, we agree that the poll side is more
>>>> likely on bigger buffers.
>>>>
>>>>>> outside of cases where pacing just isn't feasible, it's extra
>>>>>> overhead for cases where you potentially could or what.
>>>>>
>>>>> I lost it, what overhead?
>>>>
>>>> Overhead of needing to serialize the sends in the application, which may
>>>> include both extra memory needed and overhead in dealing with it.
>>>
>>> I think I misread the code. Does it push 1 request for each
>>> send buffer / queue_send() in case of provided bufs?
>>
>> Right, that's the way it's currently setup. Per send (per loop), if
>> you're using provided buffers, it'll do a send per buffer. If using
>> multishot on top of that, it'll do one send per loop regardless of the
>> number of buffers.
> 
> Ok, then I'd say the performance tests are misleading, at least
> the proxy one. For selected buffers, sending tons of requests sounds
> unnecessarily expensive, but I don't have numbers to back it. The
> normal case must employ the natural batching it has with
> serialisation.
> 
> It's important comparing programs that are optimised and close
> to what is achievable in userspace. You wouldn't compare it with
> 
> for (i = 0; i < nr_bytes; i++)
>     send(fd, data + i, 1, 0);
> 
> and claim that it's N times faster. Or surely we can add an
> empty "for" loop to one side and say "users are stupid and
> will put garbage here anyway, so fair game", but then it
> needs a note in small font "valid only for users who can't
> write code up to standard"
> 
> static void defer_send() {
>     ps = malloc(sizeof(*ps));
> }
> 
> fwiw, maybe malloc is not really expensive there, but
> that sounds like a pretty bad practice for a hi perf
> benchmark.

It's comparing using send with a manually managed backlog, or using
send where you don't have to do that. I think that's pretty valid by
itself. If you don't do that, you need to use sendmsg and append iovecs.
Which is obviously also a very valid use case and would avoid the
backlog, at the cost of sendmsg being more expensive than send. If done
right, the sendmsg+append would be a lot closer to send with multishot.

When I have some time I can add the append case, or feel free to do
that yourself, and I can run some testing with that too.

>>> Anyway, the overhead of serialisation would be negligible.
>>> And that's same extra memory you keep for the provided buffer
>>> pool, and you can allocate it once. Also consider that provided
>>> buffers are fixed size and it'd be hard to resize without waiting,
>>> thus the userspace would still need to have another, userspace
>>> backlog, it can't just drop requests. Or you make provided queues
>>> extra large, but it's per socket and you'd be wasting lots of memory.
>>>
>>> IOW, I don't think this overhead comes anywhere close to
>>> explaining the 30%+ perf gap.
>>
>> The 32-byte case is obviously somewhat pathological, as you're going to
>> be much better off having a bunch of these pipelined rather than issued
>> serially. As you can see from the 1000 byte packets, at that point it
>> doesn't matter that much. It's mostly about making it simpler at that
>> point.
>>
>>>>>> To me, the main appeal of this is the simplicity.
>>>>>
>>>>> I'd argue it doesn't seem any simpler than the alternative.
>>>>
>>>> It's certainly simpler for an application to do "add buffer to queue"
>>>> and not need to worry about managing sends, than it is to manage a
>>>> backlog of only having a single send active.
>>>
>>> They still need to manage / re-queue sends. And maybe I
>>> misunderstand the point, but it's only one request inflight
>>> per socket in either case.
>>
>> Sure, but one is a manageable condition, the other one is not. If you
>> can keep N inflight at the same time and only abort the chain in case of
>> error/short send, that's a corner case. Versus not knowing when things
>> get reordered, and hence always needing to serialize.
> 
> Manageable or not, you still have to implement all that, whereas
> serialisation is not complex and I doubt it's anywhere expensive
> enough to overturn the picture. It seems that multishot selected
> bufs would also need serialisation, and for oneshots managing
> multiple requests when you don't know which one sends what buffer
> would be complicated in real scenarios.

What "all that"? It's pretty trivial when you have a normal abort
condition to handle it. For the sendmsg case, you can append multiple
iovecs, but you can still only have one sendmsg inflight. If you start
prepping the next one before the previous one has successfully
completed, you have the same issue again.

The only serialization you need is to have one inflight at a time,
which is in your best interest anyway, as it would be more wasteful to
submit several that each try to empty that particular queue.
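
The gating itself is about as simple as it gets. Roughly, with
hypothetical per-connection fields (not proxy code):

	if (!c->send_inflight) {
		c->send_inflight = true;
		submit_sendmsg(ring, c);	/* placeholder */
	} else {
		backlog_append(c, data, len);	/* placeholder */
	}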

>>>> What kind of batching? The batching done by the tests is the same,
>>>> regardless of whether or not send multishot is used in the sense that we
>>>
>>> You can say that, but I say that it moves into the kernel
>>> batching that can be implemented in userspace.
>>
>> And then most people get it wrong or just do the basic stuff, and
>> performance isn't very good. Getting the most out of it can be tricky
>> and require extensive testing and knowledge building. I'm confident
>> you'd be able to write an efficient version, but that's not the same as
>> saying "it's trivial to write an efficient version".
> 
> It actually _is_ trivial to write an efficient version for anyone
> competent enough and having a spare day for that, which is usually
> in the form of a library

If using sendmsg.

> I'm a firm believer of not putting into the kernel what can
> already be well implemented in userspace, because the next step
> could be "firefox can be implemented in userspace, but it requires
> knowledge building, so let's implement it in the kernel". At
> least I recall nginx / HTTP servers not flying well

Sorry but that's nonsense, we add kernel features all the time that
could be done (just less efficiently) in userspace. And the firefox
comment is crazy hyperbole, not relevant here at all.

>>> In terms of "proxy", the first approximation would be to
>>> do sth like defer_send() for normal requests as well, then
>>>
>>> static void __queue_send(struct io_uring *ring, struct conn *c, int fd,
>>>               void *data, int bid, int len)
>>> {
>>>      ...
>>>
>>>      defer_send(data);
>>>
>>>      while (buf = defer_backlog.get()) {
>>>          iov[idx++] = buf;
>>>      }
>>>      msghdr->iovlen = idx;
>>>      ...
>>> }
>>
>> Yep, that's the iovec coalescing, and that could certainly be done. And
>> then you need to size the iov[] so that it's always big enough, OR
>> submit that send and still deal with managing your own backlog.
> 
> Which is as trivial as iov.push_back() or a couple of lines with
> realloc, whereas for selected buffers you would likely need to
> wait for the previous request[s] to complete, which you cannot
> do in place.

Go implement it and we can benchmark. It won't solve sendmsg being
slower than send in general, but it should certainly get a lot closer
than using sendmsg with one serialized send per buffer.

> The point is, it seems that when you'd try to write something
> error proof and real life ready, it wouldn't look simpler or
> much simpler than the approach suggested, then the performance
> is the question.

The condition for sendmsg with append and send with multishot handling
is basically the same, you need to deal with checking the send result and
restarting your send from there. Obviously in practice this will
never/rarely happen unless the connection is aborted anyway. If you use
send multishot with MSG_WAITALL, then if the buffer fills you'll just
restart when it has space, nothing to handle there.
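
For completeness, the restart for the append case is the usual iovec
adjustment, something like the below with 'sent' being the CQE result:

	while (sent > 0) {
		if ((size_t)sent >= iov->iov_len) {
			/* this vec was fully sent, drop it */
			sent -= iov->iov_len;
			iov++;
			nr_vecs--;
		} else {
			/* partial send, adjust base and length */
			iov->iov_base = (char *)iov->iov_base + sent;
			iov->iov_len -= sent;
			sent = 0;
		}
	}
	/* resubmit the sendmsg with nr_vecs entries starting at iov */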

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-28 17:28                             ` Jens Axboe
@ 2024-02-28 23:49                               ` Jens Axboe
  2024-02-29  1:46                                 ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-28 23:49 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/28/24 10:28 AM, Jens Axboe wrote:
> When I have some time I can add the append case, or feel free to do
> that yourself, and I can run some testing with that too.

I did a quick hack to add the append mode, and by default we get roughly
ring_size / 2 number of appended vecs, which I guess is expected.
There's a few complications there:

1) We basically need a per-send data structure at this point. While
   that's not hard to do, at least for my case I'd need to add that just
   for this case and everything would now need to do it. Perhaps. You
   can perhaps steal a few low bits and only do it for sendmsg. But why?
   Because now you have multiple buffer IDs in a send completion, and
   you need to be able to unravel it. If we always receive and send in
   order, then it'd always be contiguous, which I took advantage of.
   Not a huge deal, just mentioning some of the steps I had to go
   through (a rough sketch of the structure follows this list).

2) The iovec. Fine if you have the above data structure, as you can just
   alloc/realloc -> free when done. I just took the easy path and made
   the iovec big enough (eg ring size).
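
As a rough sketch, the per-send structure ends up looking something
like this (hypothetical, not the exact hack):

	struct pending_send {
		struct iovec *iov;	/* sized to the ring, per 2) */
		int nr_vecs;
		int first_bid;		/* bids stay contiguous if rx/tx
					 * remain in order, per 1) */
	};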

Outside of that, didn't need to make a lot of changes:

 1 file changed, 39 insertions(+), 17 deletions(-)

Performance is great, because we get to pass in N (in my case ~65)
packets per send. No more per packet locking. Which I do think
highlights that we can do better on the multishot send/sendmsg by
grabbing buffers upfront and passing them in in one go rather than
simply loop around calling tcp_sendmsg_locked() for each one.

Anyway, something to look into!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-28 23:49                               ` Jens Axboe
@ 2024-02-29  1:46                                 ` Jens Axboe
  2024-02-29 15:42                                   ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2024-02-29  1:46 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/28/24 4:49 PM, Jens Axboe wrote:
> On 2/28/24 10:28 AM, Jens Axboe wrote:
>> When I have some time I can add the append case, or feel free to do
>> that yourself, and I can run some testing with that too.
> 
> I did a quick hack to add the append mode, and by default we get roughly
> ring_size / 2 number of appended vecs, which I guess is expected.
> There's a few complications there:
> 
> 1) We basically need a per-send data structure at this point. While
>    that's not hard to do, at least for my case I'd need to add that just
>    for this case and everything would now need to do it. Perhaps. You
>    can perhaps steal a few low bits and only do it for sendmsg. But why?
>    Because now you have multiple buffer IDs in a send completion, and
>    you need to be able to unravel it. If we always receive and send in
>    order, then it'd always be contiguous, which I took advantage of.
>    Not a huge deal, just mentioning some of the steps I had to go
>    through.
> 
> 2) The iovec. Fine if you have the above data structure, as you can just
>    alloc/realloc -> free when done. I just took the easy path and made
>    the iovec big enough (eg ring size).
> 
> Outside of that, didn't need to make a lot of changes:
> 
>  1 file changed, 39 insertions(+), 17 deletions(-)
> 
> Performance is great, because we get to pass in N (in my case ~65)
> packets per send. No more per packet locking. Which I do think
> highlights that we can do better on the multishot send/sendmsg by
> grabbing buffers upfront and passing them in in one go rather than
> simply loop around calling tcp_sendmsg_locked() for each one.

In terms of absolute numbers, previous best times were multishot send,
which ran in 3125 usec. Using either the above approach, or a hacked up
version of multishot send that uses provided buffers and bundles them
into one send (ala sendmsg), the runtimes are within 1% of each other
and too close to call. But the runtime is around 2320 usec, or around 25%
faster than doing one issue at the time.

This is using the same small packet size of 32 bytes. Just did the
bundled send multishot thing to test, haven't tested more than that so
far.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/8] io_uring/net: support multishot for send
  2024-02-29  1:46                                 ` Jens Axboe
@ 2024-02-29 15:42                                   ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2024-02-29 15:42 UTC (permalink / raw)
  To: Pavel Begunkov, Dylan Yudaken; +Cc: io-uring

On 2/28/24 6:46 PM, Jens Axboe wrote:
> On 2/28/24 4:49 PM, Jens Axboe wrote:
>> On 2/28/24 10:28 AM, Jens Axboe wrote:
>>> When I have some time I can add the append case, or feel free to do
>>> that yourself, and I can run some testing with that too.
>>
>> I did a quick hack to add the append mode, and by default we get roughly
>> ring_size / 2 number of appended vecs, which I guess is expected.
>> There's a few complications there:
>>
>> 1) We basically need a per-send data structure at this point. While
>>    that's not hard to do, at least for my case I'd need to add that just
>>    for this case and everything would now need to do it. Perhaps. You
>>    can perhaps steal a few low bits and only do it for sendmsg. But why?
>>    Because now you have multiple buffer IDs in a send completion, and
>>    you need to be able to unravel it. If we always receive and send in
>>    order, then it'd always be contiguous, which I took advantage of.
>>    Not a huge deal, just mentioning some of the steps I had to go
>>    through.
>>
>> 2) The iovec. Fine if you have the above data structure, as you can just
>>    alloc/realloc -> free when done. I just took the easy path and made
>>    the iovec big enough (eg ring size).
>>
>> Outside of that, didn't need to make a lot of changes:
>>
>>  1 file changed, 39 insertions(+), 17 deletions(-)
>>
>> Performance is great, because we get to pass in N (in my case ~65)
>> packets per send. No more per packet locking. Which I do think
>> highlights that we can do better on the multishot send/sendmsg by
>> grabbing buffers upfront and passing them in in one go rather than
>> simply loop around calling tcp_sendmsg_locked() for each one.
> 
> In terms of absolute numbers, previous best times were multishot send,
> which ran in 3125 usec. Using either the above approach, or a hacked up
> version of multishot send that uses provided buffers and bundles them
> into one send (ala sendmsg), the runtimes are within 1% of each other
> and too close to call. But the runtime is around 2320 usec, or around 25%
> faster than doing one issue at the time.
> 
> This is using the same small packet size of 32 bytes. Just did the
> bundled send multishot thing to test, haven't tested more than that so
> far.

Update: I had a bug in the sendmsg with multiple vecs, it did not limit
itself to a single msg inflight at a time, it could have multiple. Which
obviously can't work. That voids the sendmsg append results for now.
Will fiddle with it a bit and get it working, and post the full results
when I have them.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2024-02-29 15:42 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-25  0:35 [PATCHSET v3 0/8] Support for provided buffers for send Jens Axboe
2024-02-25  0:35 ` [PATCH 1/8] io_uring/net: unify how recvmsg and sendmsg copy in the msghdr Jens Axboe
2024-02-26 14:41   ` Pavel Begunkov
2024-02-26 15:03     ` Jens Axboe
2024-02-25  0:35 ` [PATCH 2/8] net: remove {revc,send}msg_copy_msghdr() from exports Jens Axboe
2024-02-25  0:35 ` [PATCH 3/8] io_uring/net: add provided buffer support for IORING_OP_SEND Jens Axboe
2024-02-25  0:35 ` [PATCH 4/8] io_uring/net: add provided buffer support for IORING_OP_SENDMSG Jens Axboe
2024-02-25  0:35 ` [PATCH 5/8] io_uring/kbuf: flag request if buffer pool is empty after buffer pick Jens Axboe
2024-02-25  0:35 ` [PATCH 6/8] io_uring/net: support multishot for send Jens Axboe
2024-02-26 10:47   ` Dylan Yudaken
2024-02-26 13:38     ` Jens Axboe
2024-02-26 14:02       ` Dylan Yudaken
2024-02-26 14:27         ` Jens Axboe
2024-02-26 14:36           ` Pavel Begunkov
2024-02-26 15:16             ` Jens Axboe
2024-02-26 15:41               ` Pavel Begunkov
2024-02-26 19:11                 ` Jens Axboe
2024-02-26 19:21                   ` Pavel Begunkov
2024-02-26 20:12                     ` Jens Axboe
2024-02-26 20:51                       ` Pavel Begunkov
2024-02-26 21:27                         ` Jens Axboe
2024-02-28 12:39                           ` Pavel Begunkov
2024-02-28 17:28                             ` Jens Axboe
2024-02-28 23:49                               ` Jens Axboe
2024-02-29  1:46                                 ` Jens Axboe
2024-02-29 15:42                                   ` Jens Axboe
2024-02-26 19:31           ` Dylan Yudaken
2024-02-26 19:49             ` Jens Axboe
2024-02-25  0:35 ` [PATCH 7/8] io_uring/net: support multishot for sendmsg Jens Axboe
2024-02-25  0:35 ` [PATCH 8/8] io_uring/net: set MSG_MORE if we're doing multishot send and have more Jens Axboe
2024-02-26 10:59   ` Dylan Yudaken
2024-02-26 13:42     ` Jens Axboe
2024-02-26 14:24       ` Pavel Begunkov
2024-02-26 14:52         ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox