public inbox for [email protected]
* [PATCHSET RFC 0/4] Coalesce provided buffer segments
@ 2024-08-12 16:55 Jens Axboe
  2024-08-12 16:55 ` [PATCH 1/4] io_uring/kbuf: have io_provided_buffers_select() take buf_sel_arg Jens Axboe
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Jens Axboe @ 2024-08-12 16:55 UTC (permalink / raw)
  To: io-uring

Hi,

When selecting provided buffers for a send/recv bundle, there's no
reason why the number of buffers selected must equal the number of
mapped segments passed to send/recv. If some (or all) of these buffers
are virtually contiguous, they can be collapsed into far fewer
segments, sometimes even a single segment. This avoids costly
iteration on the send/recv processing side.
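
As a rough illustration of the idea (plain userspace C, not the kernel
code; the helper name and layout here are made up for the example),
the coalescing boils down to an adjacency check and merge over the
selected segments:

#include <limits.h>
#include <sys/uio.h>

/*
 * Illustrative only: collapse virtually contiguous entries of 'src'
 * into 'dst', returning how many segments remain.
 */
static int coalesce_iovecs(const struct iovec *src, int nr, struct iovec *dst)
{
	int i, out = 0;

	for (i = 0; i < nr; i++) {
		if (out &&
		    (char *)dst[out - 1].iov_base + dst[out - 1].iov_len ==
				src[i].iov_base &&
		    dst[out - 1].iov_len + src[i].iov_len <= INT_MAX) {
			/* adjacent in virtual memory, extend previous segment */
			dst[out - 1].iov_len += src[i].iov_len;
		} else {
			dst[out++] = src[i];
		}
	}
	return out;
}

In the kernel this check happens inline as buffers are peeked off the
ring, so no separate pass is needed.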

The return values are the number of bytes sent/received and the
starting buffer ID where the operation began. This is identical to how
bundles already work; from the application's point of view nothing
changes in how send/recv bundles are handled, so this is a transparent
feature.
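
For reference, here's a rough sketch of what consuming such a
completion looks like on the application side, assuming fixed-size
provided buffers and an application-defined consume() callback (both
assumptions for the example, not something this series adds); the
starting buffer ID comes out of the CQE flags as with any
provided-buffer completion:

#include <liburing.h>

/* Illustrative only: walk the buffers a bundle completion covered. */
static void handle_bundle_cqe(struct io_uring_cqe *cqe,
			      unsigned char *bufs[], unsigned buf_size,
			      void (*consume)(void *data, unsigned len))
{
	int bytes = cqe->res;
	unsigned bid;

	if (bytes <= 0 || !(cqe->flags & IORING_CQE_F_BUFFER))
		return;

	bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	while (bytes > 0) {
		unsigned this_len = (unsigned)bytes < buf_size ?
					(unsigned)bytes : buf_size;

		consume(bufs[bid], this_len);
		bytes -= this_len;
		bid++;	/* ring wrap-around omitted for brevity */
	}
}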

Patches 1-3 are just basic prep patches, and patch 4 allows for actual
coalescing of segments. This is only enabled for bundles, as those are
the types of requests that process multiple buffers in a single
operation.

Patches are on top of 6.11-rc3 with pending io_uring patches, as well
as the incremental buffer consumption patches [1] posted earlier today.

 io_uring/kbuf.c | 71 ++++++++++++++++++++++++++++++++++++++++++-------
 io_uring/kbuf.h |  7 +++--
 io_uring/net.c  | 55 +++++++++++++++++---------------------
 io_uring/net.h  |  1 +
 4 files changed, 91 insertions(+), 43 deletions(-)

-- 
Jens Axboe



* [PATCH 1/4] io_uring/kbuf: have io_provided_buffers_select() take buf_sel_arg
  2024-08-12 16:55 [PATCHSET RFC 0/4] Coalesce provided buffer segments Jens Axboe
@ 2024-08-12 16:55 ` Jens Axboe
  2024-08-12 16:55 ` [PATCH 2/4] io_uring/net: pass in io_kiocb to io_bundle_nbufs() Jens Axboe
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2024-08-12 16:55 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Rather than pass in the iovec in both spots, pass in the buf_sel_arg
struct pointer directly. This is in preparation for needing more of
the selection struct along that path.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/kbuf.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 793b2454acca..794a687d8589 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -119,7 +119,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
 
 static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
 				      struct io_buffer_list *bl,
-				      struct iovec *iov)
+				      struct buf_sel_arg *arg)
 {
 	void __user *buf;
 
@@ -127,8 +127,8 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
 	if (unlikely(!buf))
 		return -ENOBUFS;
 
-	iov[0].iov_base = buf;
-	iov[0].iov_len = *len;
+	arg->iovs[0].iov_base = buf;
+	arg->iovs[0].iov_len = *len;
 	return 0;
 }
 
@@ -296,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
 			bl->head += ret;
 		}
 	} else {
-		ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
+		ret = io_provided_buffers_select(req, &arg->out_len, bl, arg);
 	}
 out_unlock:
 	io_ring_submit_unlock(ctx, issue_flags);
@@ -323,7 +323,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
 	}
 
 	/* don't support multiple buffer selections for legacy */
-	return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
+	return io_provided_buffers_select(req, &arg->max_len, bl, arg);
 }
 
 static int __io_remove_buffers(struct io_ring_ctx *ctx,
-- 
2.43.0



* [PATCH 2/4] io_uring/net: pass in io_kiocb to io_bundle_nbufs()
  2024-08-12 16:55 [PATCHSET RFC 0/4] Coalesce provided buffer segments Jens Axboe
  2024-08-12 16:55 ` [PATCH 1/4] io_uring/kbuf: have io_provided_buffers_select() take buf_sel_arg Jens Axboe
@ 2024-08-12 16:55 ` Jens Axboe
  2024-08-12 16:55 ` [PATCH 3/4] io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg Jens Axboe
  2024-08-12 16:55 ` [PATCH 4/4] io_uring/net: allow coalescing of mapped segments Jens Axboe
  3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2024-08-12 16:55 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

In preparation for needing the io_kiocb in there; the kmsg can always
be retrieved from it anyway.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/net.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index e312fc1ed7de..a6268e62b348 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -459,8 +459,9 @@ static void io_req_msg_cleanup(struct io_kiocb *req,
  * data in the iter, then loop the segments to figure out how much we
  * transferred.
  */
-static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
+static int io_bundle_nbufs(struct io_kiocb *req, int ret)
 {
+	struct io_async_msghdr *kmsg = req->async_data;
 	struct iovec *iov;
 	int nbufs;
 
@@ -503,7 +504,7 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
 		goto finish;
 	}
 
-	cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);
+	cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(req, *ret), issue_flags);
 
 	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
 		goto finish;
@@ -844,7 +845,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
 
 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
-		cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
+		cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(req, *ret),
 				      issue_flags);
 		/* bundle with no more immediate buffers, we're done */
 		if (req->flags & REQ_F_BL_EMPTY)
-- 
2.43.0



* [PATCH 3/4] io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  2024-08-12 16:55 [PATCHSET RFC 0/4] Coalesce provided buffer segments Jens Axboe
  2024-08-12 16:55 ` [PATCH 1/4] io_uring/kbuf: have io_provided_buffers_select() take buf_sel_arg Jens Axboe
  2024-08-12 16:55 ` [PATCH 2/4] io_uring/net: pass in io_kiocb to io_bundle_nbufs() Jens Axboe
@ 2024-08-12 16:55 ` Jens Axboe
  2024-08-12 16:55 ` [PATCH 4/4] io_uring/net: allow coalescing of mapped segments Jens Axboe
  3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2024-08-12 16:55 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

nr_iovs is capped at 1024, and mode only holds a few low values. Both
can safely be shrunk to unsigned short, in preparation for adding a
few more members.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/kbuf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 1d56092d9286..5625ff0e349d 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -59,8 +59,8 @@ struct buf_sel_arg {
 	struct iovec *iovs;
 	size_t out_len;
 	size_t max_len;
-	int nr_iovs;
-	int mode;
+	unsigned short nr_iovs;
+	unsigned short mode;
 };
 
 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
-- 
2.43.0



* [PATCH 4/4] io_uring/net: allow coalescing of mapped segments
  2024-08-12 16:55 [PATCHSET RFC 0/4] Coalesce provided buffer segments Jens Axboe
                   ` (2 preceding siblings ...)
  2024-08-12 16:55 ` [PATCH 3/4] io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg Jens Axboe
@ 2024-08-12 16:55 ` Jens Axboe
  3 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2024-08-12 16:55 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

For bundles, when multiple buffers are selected, it's not unlikely
that some (or all) of them will be virtually contiguous. If the
individual segments aren't big, nice wins can be had by coalescing
them into bigger segments. This makes networking copies more efficient
and reduces the number of iterations that need to be done over an
iovec. Ideally, multiple segments that would previously have been
mapped as an ITER_IOVEC can now be mapped as a single ITER_UBUF
iterator.

Example from an io_uring network backend receiving data, with various
transfer sizes, over a 100G network link.

recv size    coalesce    threads    bw          cpu usage    bw diff
=====================================================================
64             0           1       23GB/sec       100%
64             1           1       46GB/sec        79%        +100%
64             0           4       81GB/sec       370%
64             1           4       96GB/sec       160%        + 20%
256            0           1       44GB/sec        90%
256            1           1       47GB/sec        48%        +  7%
256            0           4       90GB/sec       190%
256            1           4       96GB/sec       120%        +  7%
1024           0           1       49GB/sec        60%
1024           1           1       50GB/sec        53%        +  2%
1024           0           4       94GB/sec       140%
1024           1           4       96GB/sec       120%        +  2%

Unsurprisingly, small buffer sizes benefit the most, but an
efficiency gain is seen at the higher buffer sizes as well.
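
One consequence worth noting: once segments are coalesced, a short
transfer can no longer be mapped back to buffers by counting iovec
entries, since a single iovec may now cover several buffers. The patch
handles that by walking the ring entries (io_buffer_segments() below);
as a standalone, illustrative-only sketch of that accounting (the
helper and its arguments are made up for the example, with the
per-buffer lengths assumed known):

/*
 * Illustrative only: map 'bytes' transferred back to the number of
 * provided buffers it consumed, given each buffer's length.
 */
static int buffers_consumed(const unsigned *buf_lens, int nr_bufs, int bytes)
{
	int i = 0;

	while (bytes > 0 && i < nr_bufs) {
		int this_len = bytes < (int)buf_lens[i] ? bytes : (int)buf_lens[i];

		bytes -= this_len;
		i++;
	}
	return i;
}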

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/kbuf.c | 61 +++++++++++++++++++++++++++++++++++++++++++++----
 io_uring/kbuf.h |  3 +++
 io_uring/net.c  | 48 ++++++++++++++++----------------------
 io_uring/net.h  |  1 +
 4 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 794a687d8589..76eade5a5567 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -129,6 +129,7 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
 
 	arg->iovs[0].iov_base = buf;
 	arg->iovs[0].iov_len = *len;
+	arg->nsegs = 1;
 	return 0;
 }
 
@@ -194,11 +195,16 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 /* cap it at a reasonable 256, will be one page even for 4K */
 #define PEEK_MAX_IMPORT		256
 
+/*
+ * Returns how many iovecs were used to fill the range. arg->nsegs contains
+ * the number of buffers mapped, which may be more than the return value if
+ * segments were coalesced.
+ */
 static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 				struct io_buffer_list *bl)
 {
 	struct io_uring_buf_ring *br = bl->buf_ring;
-	struct iovec *iov = arg->iovs;
+	struct iovec *prev_iov, *iov = arg->iovs;
 	int nr_iovs = arg->nr_iovs;
 	__u16 nr_avail, tail, head;
 	struct io_uring_buf *buf;
@@ -239,9 +245,11 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 	if (!arg->max_len)
 		arg->max_len = INT_MAX;
 
+	prev_iov = NULL;
 	req->buf_index = buf->bid;
 	do {
 		int len = buf->len;
+		void __user *ubuf;
 
 		/* truncate end piece, if needed */
 		if (len > arg->max_len) {
@@ -250,10 +258,20 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 				buf->len = len;
 		}
 
-		iov->iov_base = u64_to_user_ptr(buf->addr);
-		iov->iov_len = len;
-		iov++;
+		ubuf = u64_to_user_ptr(buf->addr);
+		if (prev_iov &&
+		    prev_iov->iov_base + prev_iov->iov_len == ubuf &&
+		    prev_iov->iov_len + len <= INT_MAX) {
+			prev_iov->iov_len += len;
+		} else {
+			iov->iov_base = ubuf;
+			iov->iov_len = len;
+			if (arg->coalesce)
+				prev_iov = iov;
+			iov++;
+		}
 
+		arg->nsegs++;
 		arg->out_len += len;
 		arg->max_len -= len;
 		if (!arg->max_len)
@@ -266,7 +284,8 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 		req->flags |= REQ_F_BL_EMPTY;
 
 	req->flags |= REQ_F_BUFFER_RING;
-	req->buf_list = bl;
+	if (arg->coalesce)
+		req->buf_list = bl;
 	return iov - arg->iovs;
 }
 
@@ -326,6 +345,38 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
 	return io_provided_buffers_select(req, &arg->max_len, bl, arg);
 }
 
+int io_buffer_segments(struct io_kiocb *req, int nbytes)
+{
+	struct io_uring_buf_ring *br;
+	struct io_buffer_list *bl;
+	int nbufs = 0;
+	unsigned bid;
+
+	/*
+	 * Safe to use ->buf_list here, as coalescing can only have happened
+	 * if we remained locked throughout the operation. Unlocked usage must
+	 * not have buf_sel_arg->coalesce set to true.
+	 */
+	bl = req->buf_list;
+	if (unlikely(!bl || !(bl->flags & IOBL_BUF_RING)))
+		return 1;
+
+	bid = req->buf_index;
+	br = bl->buf_ring;
+	do {
+		struct io_uring_buf *buf;
+		int this_len;
+
+		buf = io_ring_head_to_buf(br, bid, bl->mask);
+		this_len = min_t(int, buf->len, nbytes);
+		nbufs++;
+		bid++;
+		nbytes -= this_len;
+	} while (nbytes);
+
+	return nbufs;
+}
+
 static int __io_remove_buffers(struct io_ring_ctx *ctx,
 			       struct io_buffer_list *bl, unsigned nbufs)
 {
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 5625ff0e349d..d6d5f936fe6f 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -61,8 +61,11 @@ struct buf_sel_arg {
 	size_t max_len;
 	unsigned short nr_iovs;
 	unsigned short mode;
+	unsigned short nsegs;
+	bool coalesce;
 };
 
+int io_buffer_segments(struct io_kiocb *req, int nbytes);
 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 			      unsigned int issue_flags);
 int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
diff --git a/io_uring/net.c b/io_uring/net.c
index a6268e62b348..f15671f5b118 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -462,33 +462,16 @@ static void io_req_msg_cleanup(struct io_kiocb *req,
 static int io_bundle_nbufs(struct io_kiocb *req, int ret)
 {
 	struct io_async_msghdr *kmsg = req->async_data;
-	struct iovec *iov;
-	int nbufs;
 
-	/* no data is always zero segments, and a ubuf is always 1 segment */
+	/* no data is always zero segments */
 	if (ret <= 0)
 		return 0;
-	if (iter_is_ubuf(&kmsg->msg.msg_iter))
-		return 1;
-
-	iov = kmsg->free_iov;
-	if (!iov)
-		iov = &kmsg->fast_iov;
-
-	/* if all data was transferred, it's basic pointer math */
+	/* if all data was transferred, we already know the number of buffers */
 	if (!iov_iter_count(&kmsg->msg.msg_iter))
-		return iter_iov(&kmsg->msg.msg_iter) - iov;
-
-	/* short transfer, count segments */
-	nbufs = 0;
-	do {
-		int this_len = min_t(int, iov[nbufs].iov_len, ret);
-
-		nbufs++;
-		ret -= this_len;
-	} while (ret);
+		return kmsg->nbufs;
 
-	return nbufs;
+	/* short transfer, iterate buffers to find number of segments */
+	return io_buffer_segments(req, ret);
 }
 
 static inline bool io_send_finish(struct io_kiocb *req, int *ret,
@@ -602,6 +585,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 			.iovs = &kmsg->fast_iov,
 			.max_len = INT_MAX,
 			.nr_iovs = 1,
+			.coalesce = !(issue_flags & IO_URING_F_UNLOCKED),
 		};
 
 		if (kmsg->free_iov) {
@@ -625,6 +609,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 			req->flags |= REQ_F_NEED_CLEANUP;
 		}
 		sr->len = arg.out_len;
+		kmsg->nbufs = arg.nsegs;
 
 		if (ret == 1) {
 			sr->buf = arg.iovs[0].iov_base;
@@ -1080,6 +1065,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
 			.iovs = &kmsg->fast_iov,
 			.nr_iovs = 1,
 			.mode = KBUF_MODE_EXPAND,
+			.coalesce = true,
 		};
 
 		if (kmsg->free_iov) {
@@ -1095,7 +1081,18 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
 		if (unlikely(ret < 0))
 			return ret;
 
-		/* special case 1 vec, can be a fast path */
+		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+			kmsg->free_iov_nr = arg.nsegs;
+			kmsg->free_iov = arg.iovs;
+			req->flags |= REQ_F_NEED_CLEANUP;
+		}
+		kmsg->nbufs = arg.nsegs;
+
+		/*
+		 * Special case 1 vec, can be a fast path. Note that multiple
+		 * contig buffers may get mapped to a single vec, but we can
+		 * still use ITER_UBUF for those.
+		 */
 		if (ret == 1) {
 			sr->buf = arg.iovs[0].iov_base;
 			sr->len = arg.iovs[0].iov_len;
@@ -1103,11 +1100,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
 		}
 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
 				arg.out_len);
-		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
-			kmsg->free_iov_nr = ret;
-			kmsg->free_iov = arg.iovs;
-			req->flags |= REQ_F_NEED_CLEANUP;
-		}
 	} else {
 		void __user *buf;
 
diff --git a/io_uring/net.h b/io_uring/net.h
index 52bfee05f06a..b9a453da4a0f 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -9,6 +9,7 @@ struct io_async_msghdr {
 	/* points to an allocated iov, if NULL we use fast_iov instead */
 	struct iovec			*free_iov;
 	int				free_iov_nr;
+	int				nbufs;
 	int				namelen;
 	__kernel_size_t			controllen;
 	__kernel_size_t			payloadlen;
-- 
2.43.0


