From: Jens Axboe <[email protected]>
To: [email protected]
Cc: [email protected], [email protected], [email protected],
	Jens Axboe <[email protected]>
Subject: [PATCH 3/7] io_uring/kbuf: add helpers for getting/peeking multiple buffers
Date: Fri,  8 Mar 2024 16:34:08 -0700	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

Our provided buffer interface only allows selection of a single buffer.
Add an API that allows getting/peeking multiple buffers at the same time.

This is only implemented for the ring provided buffers. It could be added
for the legacy provided buffers as well, but since it's strongly
encouraged to use the new interface, let's keep it simpler and just
provide it for the new API. The legacy interface will always just select
a single buffer.

There are two new main functions:

io_buffers_select(), which selects as many buffers as it can. The
caller supplies the iovec array, and io_buffers_select() may allocate
a bigger array if the 'out_len' passed in is non-zero and bigger than
what fits in the provided iovec array. Buffers grabbed with this
helper are permanently assigned.

io_buffers_peek(), which works like io_buffers_select(), except the
buffers it returns can be recycled, if needed. Callers using either of
these functions should call io_put_kbufs() rather than io_put_kbuf()
at completion time. The peek interface must be called with the ctx
locked from peek to completion.

This adds a new flag for the request:

- REQ_F_BUFFERS_COMMIT, which means that the buffers have been
  peeked and should be committed to the buffer ring head when they are
  put as part of completion. Prior to this, req->buf_list being
  cleared to NULL was what marked the buffers as committed. But with
  the peek interface requiring the ring to be locked throughout the
  operation, req->buf_list can instead be kept set and used as a
  lookup cache.
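
To make the commit-at-put behavior concrete, here is a rough,
hypothetical sketch of the peek side (again not part of the patch);
do_transfer() is a made-up stand-in for the actual data movement and
the error handling is simplified:

	/*
	 * Simplified peek-side flow; ctx->uring_lock must be held from the
	 * peek until completion.
	 */
	static int example_peek_handler(struct io_kiocb *req, unsigned int issue_flags)
	{
		struct iovec fast_iov[UIO_FASTIOV], *iovs = fast_iov;
		size_t len = 0;
		int nbufs, ret, cflags;

		nbufs = io_buffers_peek(req, &iovs, UIO_FASTIOV, &len);
		if (unlikely(nbufs < 0))
			return nbufs;

		ret = do_transfer(req, iovs, nbufs);	/* made-up helper */
		if (ret <= 0) {
			/* nothing consumed: the peeked buffers go back to the ring */
			io_kbuf_recycle(req, issue_flags);
			if (iovs != fast_iov)
				kfree(iovs);
			return ret;
		}

		/*
		 * Assume everything mapped was consumed; REQ_F_BUFFERS_COMMIT
		 * makes this advance the ring head by nbufs.
		 */
		cflags = io_put_kbufs(req, nbufs, issue_flags);
		io_req_set_res(req, ret, cflags);
		if (iovs != fast_iov)
			kfree(iovs);
		return IOU_OK;
	}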

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h |   3 +
 io_uring/kbuf.c                | 203 ++++++++++++++++++++++++++++++---
 io_uring/kbuf.h                |  39 +++++--
 3 files changed, 223 insertions(+), 22 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e24893625085..971294dfd22e 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -481,6 +481,7 @@ enum {
 	REQ_F_CAN_POLL_BIT,
 	REQ_F_BL_EMPTY_BIT,
 	REQ_F_BL_NO_RECYCLE_BIT,
+	REQ_F_BUFFERS_COMMIT_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -559,6 +560,8 @@ enum {
 	REQ_F_BL_EMPTY		= IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
 	/* don't recycle provided buffers for this request */
 	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
+	/* buffer ring head needs incrementing on put */
+	REQ_F_BUFFERS_COMMIT	= IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 9be42bff936b..921e8e25e027 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -140,34 +140,57 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
 	return NULL;
 }
 
+static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
+				      struct io_buffer_list *bl,
+				      struct iovec *iov)
+{
+	void __user *buf;
+
+	buf = io_provided_buffer_select(req, len, bl);
+	if (unlikely(!buf))
+		return -ENOBUFS;
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = *len;
+	return 0;
+}
+
+static struct io_uring_buf *io_ring_head_to_buf(struct io_buffer_list *bl,
+						__u16 head)
+{
+	head &= bl->mask;
+
+	/* mmaped buffers are always contig */
+	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+		return &bl->buf_ring->bufs[head];
+	} else {
+		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
+		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
+		struct io_uring_buf *buf;
+
+		buf = page_address(bl->buf_pages[index]);
+		return buf + off;
+	}
+}
+
 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 					  struct io_buffer_list *bl,
 					  unsigned int issue_flags)
 {
-	struct io_uring_buf_ring *br = bl->buf_ring;
 	__u16 tail, head = bl->head;
 	struct io_uring_buf *buf;
 
-	tail = smp_load_acquire(&br->tail);
+	tail = smp_load_acquire(&bl->buf_ring->tail);
 	if (unlikely(tail == head))
 		return NULL;
 
 	if (head + 1 == tail)
 		req->flags |= REQ_F_BL_EMPTY;
 
-	head &= bl->mask;
-	/* mmaped buffers are always contig */
-	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
-		buf = &br->bufs[head];
-	} else {
-		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
-		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
-		buf = page_address(bl->buf_pages[index]);
-		buf += off;
-	}
+	buf = io_ring_head_to_buf(bl, head);
 	if (*len == 0 || *len > buf->len)
 		*len = buf->len;
-	req->flags |= REQ_F_BUFFER_RING;
+	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
 	req->buf_list = bl;
 	req->buf_index = buf->bid;
 
@@ -182,6 +205,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 		 * the transfer completes (or if we get -EAGAIN and must poll of
 		 * retry).
 		 */
+		req->flags &= ~REQ_F_BUFFERS_COMMIT;
 		req->buf_list = NULL;
 		bl->head++;
 	}
@@ -208,6 +232,159 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 	return ret;
 }
 
+static int io_ring_buffers_peek(struct io_kiocb *req, struct iovec **iovs,
+				int nr_iovs, size_t *out_len,
+				struct io_buffer_list *bl)
+{
+	struct iovec *iov = *iovs;
+	__u16 nr_avail, tail, head;
+	struct io_uring_buf *buf;
+	size_t max_len = 0;
+	int i;
+
+	if (*out_len) {
+		max_len = *out_len;
+		*out_len = 0;
+	}
+
+	tail = smp_load_acquire(&bl->buf_ring->tail);
+	head = bl->head;
+	nr_avail = tail - head;
+	if (unlikely(!nr_avail))
+		return -ENOBUFS;
+
+	buf = io_ring_head_to_buf(bl, head);
+	if (max_len) {
+		int needed;
+
+		needed = (max_len + buf->len - 1) / buf->len;
+		/* cap it at a reasonable 256, will be one page even for 4K */
+		needed = min(needed, 256);
+		if (nr_avail > needed)
+			nr_avail = needed;
+	}
+
+	if (nr_avail > UIO_MAXIOV)
+		nr_avail = UIO_MAXIOV;
+
+	/*
+	 * only alloc a bigger array if we know we have data to map, eg not
+	 * a speculative peek operation.
+	 */
+	if (nr_iovs == UIO_FASTIOV && nr_avail > nr_iovs && max_len) {
+		iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
+		if (unlikely(!iov))
+			return -ENOMEM;
+		nr_iovs = nr_avail;
+	} else if (nr_avail < nr_iovs) {
+		nr_iovs = nr_avail;
+	}
+
+	buf = io_ring_head_to_buf(bl, head);
+	req->buf_index = buf->bid;
+
+	i = 0;
+	while (nr_iovs--) {
+		void __user *ubuf;
+
+		/* truncate end piece, if needed */
+		if (max_len && buf->len > max_len)
+			buf->len = max_len;
+
+		ubuf = u64_to_user_ptr(buf->addr);
+		if (!access_ok(ubuf, buf->len))
+			break;
+		iov[i].iov_base = ubuf;
+		iov[i].iov_len = buf->len;
+		*out_len += buf->len;
+		i++;
+		head++;
+		if (max_len) {
+			max_len -= buf->len;
+			if (!max_len)
+				break;
+		}
+		buf = io_ring_head_to_buf(bl, head);
+	}
+
+	if (head == tail)
+		req->flags |= REQ_F_BL_EMPTY;
+
+	if (i) {
+		req->flags |= REQ_F_BUFFER_RING;
+		*iovs = iov;
+		return i;
+	}
+
+	if (iov != *iovs)
+		kfree(iov);
+	*iovs = NULL;
+	return -EFAULT;
+}
+
+int io_buffers_select(struct io_kiocb *req, struct iovec **iovs, int nr_iovs,
+		      size_t *out_len, unsigned int issue_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_buffer_list *bl;
+	int ret = -ENOENT;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	bl = io_buffer_get_list(ctx, req->buf_index);
+	if (unlikely(!bl))
+		goto out_unlock;
+
+	if (bl->is_mapped) {
+		ret = io_ring_buffers_peek(req, iovs, nr_iovs, out_len, bl);
+		/*
+		 * Don't recycle these buffers if we need to go through poll.
+		 * Nobody else can use them anyway, and holding on to provided
+		 * buffers for a send/write operation would happen on the app
+		 * side anyway with normal buffers. Besides, we already
+		 * committed them, they cannot be put back in the queue.
+		 */
+		req->buf_list = bl;
+		if (ret > 0) {
+			req->flags |= REQ_F_BL_NO_RECYCLE;
+			req->buf_list->head += ret;
+		}
+	} else {
+		ret = io_provided_buffers_select(req, out_len, bl, *iovs);
+	}
+out_unlock:
+	io_ring_submit_unlock(ctx, issue_flags);
+	return ret;
+}
+
+int io_buffers_peek(struct io_kiocb *req, struct iovec **iovs, int nr_iovs,
+		    size_t *out_len)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_buffer_list *bl;
+	int ret;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (req->buf_list) {
+		bl = req->buf_list;
+	} else {
+		bl = io_buffer_get_list(ctx, req->buf_index);
+		if (unlikely(!bl))
+			return -ENOENT;
+	}
+
+	/* don't support multiple buffer selections for legacy */
+	if (!bl->is_mapped)
+		return io_provided_buffers_select(req, out_len, bl, *iovs);
+
+	ret = io_ring_buffers_peek(req, iovs, nr_iovs, out_len, bl);
+	if (ret > 0) {
+		req->buf_list = bl;
+		req->flags |= REQ_F_BUFFERS_COMMIT;
+	}
+	return ret;
+}
+
 static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
 {
 	struct io_buffer_list *bl;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 5218bfd79e87..b4f48a144b73 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -43,6 +43,10 @@ struct io_buffer {
 
 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 			      unsigned int issue_flags);
+int io_buffers_select(struct io_kiocb *req, struct iovec **iovs, int nr_iovs,
+		      size_t *out_len, unsigned int issue_flags);
+int io_buffers_peek(struct io_kiocb *req, struct iovec **iovs, int nr_iovs,
+		      size_t *out_len);
 void io_destroy_buffers(struct io_ring_ctx *ctx);
 
 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
@@ -74,7 +78,7 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 	 */
 	if (req->buf_list) {
 		req->buf_index = req->buf_list->bgid;
-		req->flags &= ~REQ_F_BUFFER_RING;
+		req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
 		return true;
 	}
 	return false;
@@ -98,11 +102,16 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
 	return false;
 }
 
-static inline void __io_put_kbuf_ring(struct io_kiocb *req)
+static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr)
 {
-	if (req->buf_list) {
-		req->buf_index = req->buf_list->bgid;
-		req->buf_list->head++;
+	struct io_buffer_list *bl = req->buf_list;
+
+	if (bl) {
+		if (req->flags & REQ_F_BUFFERS_COMMIT) {
+			bl->head += nr;
+			req->flags &= ~REQ_F_BUFFERS_COMMIT;
+		}
+		req->buf_index = bl->bgid;
 	}
 	req->flags &= ~REQ_F_BUFFER_RING;
 }
@@ -111,7 +120,7 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req,
 				      struct list_head *list)
 {
 	if (req->flags & REQ_F_BUFFER_RING) {
-		__io_put_kbuf_ring(req);
+		__io_put_kbuf_ring(req, 1);
 	} else {
 		req->buf_index = req->kbuf->bgid;
 		list_add(&req->kbuf->list, list);
@@ -133,8 +142,8 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
 	return ret;
 }
 
-static inline unsigned int io_put_kbuf(struct io_kiocb *req,
-				       unsigned issue_flags)
+static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
+					  unsigned issue_flags)
 {
 	unsigned int ret;
 
@@ -143,9 +152,21 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
 
 	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
 	if (req->flags & REQ_F_BUFFER_RING)
-		__io_put_kbuf_ring(req);
+		__io_put_kbuf_ring(req, nbufs);
 	else
 		__io_put_kbuf(req, issue_flags);
 	return ret;
 }
+
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+				       unsigned issue_flags)
+{
+	return __io_put_kbufs(req, 1, issue_flags);
+}
+
+static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs,
+					unsigned issue_flags)
+{
+	return __io_put_kbufs(req, nbufs, issue_flags);
+}
 #endif
-- 
2.43.0


Thread overview: 9+ messages
2024-03-08 23:34 [PATCHSET RFC 0/7] Send and receive bundles Jens Axboe
2024-03-08 23:34 ` [PATCH 1/7] io_uring/net: add generic multishot retry helper Jens Axboe
2024-03-08 23:34 ` [PATCH 2/7] io_uring/net: add provided buffer support for IORING_OP_SEND Jens Axboe
2024-03-08 23:34 ` Jens Axboe [this message]
2024-03-08 23:34 ` [PATCH 4/7] io_uring/net: switch io_send() and io_send_zc() to using io_async_msghdr Jens Axboe
2024-03-08 23:34 ` [PATCH 5/7] io_uring/net: support bundles for send Jens Axboe
2024-03-08 23:34 ` [PATCH 6/7] io_uring/net: switch io_recv() to using io_async_msghdr Jens Axboe
2024-03-08 23:34 ` [PATCH 7/7] io_uring/net: support bundles for recv Jens Axboe
2024-03-10 18:15 ` [PATCHSET RFC 0/7] Send and receive bundles Jens Axboe
