public inbox for [email protected]
* [PATCH 0/8] Add support for vectored registered buffers
@ 2025-03-03 15:50 Pavel Begunkov
  2025-03-03 15:50 ` [PATCH 1/8] io_uring: introduce struct iou_vec Pavel Begunkov
                   ` (9 more replies)
  0 siblings, 10 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:50 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

Add registered buffer support for vectored io_uring operations. This
allows passing an iovec, all entries of which must belong to and
point into the same registered buffer specified by sqe->buf_index.

The series covers zerocopy sendmsg and reads / writes. Reads and
writes are implemented as new opcodes, while zerocopy sendmsg
reuses IORING_RECVSEND_FIXED_BUF for the API.
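
For a sense of the API, here is a minimal liburing sketch of the read
side. It's illustrative only: error handling is omitted, the file name
and sizes are made up, it assumes headers that define
IORING_OP_READV_FIXED, and, since this sketch assumes no dedicated
liburing helper for it, the opcode is patched in by hand after a
normal readv prep.

	#include <fcntl.h>
	#include <stdlib.h>
	#include <liburing.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		char *buf = malloc(65536);
		struct iovec reg = { .iov_base = buf, .iov_len = 65536 };
		struct iovec iov[2] = {
			{ .iov_base = buf,        .iov_len = 4096 },
			{ .iov_base = buf + 8192, .iov_len = 4096 },
		};
		int fd = open("datafile", O_RDONLY);

		io_uring_queue_init(8, &ring, 0);
		io_uring_register_buffers(&ring, &reg, 1);

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_readv(sqe, fd, iov, 2, 0);
		sqe->opcode = IORING_OP_READV_FIXED;
		sqe->buf_index = 0;	/* both iov entries live in reg */
		io_uring_submit(&ring);

		io_uring_wait_cqe(&ring, &cqe);
		io_uring_cqe_seen(&ring, cqe);
		io_uring_queue_exit(&ring);
		return 0;
	}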

Results are in line with what one would expect from registered buffers:

t/io_uring + nullblk, single segment 16K:
  34 -> 46 GiB/s
examples/send-zerocopy.c default send size (64KB):
  82558 -> 123855 MB/s

The series is placed on top of 6.15 + zcrx.

Some tests:
https://github.com/isilence/liburing.git regbuf-import

Pavel Begunkov (8):
  io_uring: introduce struct iou_vec
  io_uring: add infra for importing vectored reg buffers
  io_uring/rw: implement vectored registered rw
  io_uring/rw: defer reg buf vec import
  io_uring/net: combine msghdr copy
  io_uring/net: pull vec alloc out of msghdr import
  io_uring/net: convert to struct iou_vec
  io_uring/net: implement vectored reg bufs for zctx

 include/linux/io_uring_types.h |  11 ++
 include/uapi/linux/io_uring.h  |   2 +
 io_uring/alloc_cache.h         |   9 --
 io_uring/net.c                 | 177 ++++++++++++++++++++-------------
 io_uring/net.h                 |   6 +-
 io_uring/opdef.c               |  39 ++++++++
 io_uring/rsrc.c                | 133 +++++++++++++++++++++++++
 io_uring/rsrc.h                |  22 ++++
 io_uring/rw.c                  |  96 ++++++++++++++++--
 io_uring/rw.h                  |   8 +-
 10 files changed, 411 insertions(+), 92 deletions(-)

-- 
2.48.1


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 1/8] io_uring: introduce struct iou_vec
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
@ 2025-03-03 15:50 ` Pavel Begunkov
  2025-03-03 18:23   ` Caleb Sander Mateos
  2025-03-03 15:50 ` [PATCH 2/8] io_uring: add infra for importing vectored reg buffers Pavel Begunkov
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:50 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

I need a convenient way to pass around and work with an iovec+size
pair. Put them into a structure and make use of it in rw.c.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 include/linux/io_uring_types.h |  5 +++++
 io_uring/rsrc.c                |  9 +++++++++
 io_uring/rsrc.h                | 17 +++++++++++++++++
 io_uring/rw.c                  | 17 +++++++----------
 io_uring/rw.h                  |  6 ++++--
 5 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 35fc241c4672..9101f12d21ef 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -110,6 +110,11 @@ struct io_uring_task {
 	} ____cacheline_aligned_in_smp;
 };
 
+struct iou_vec {
+	struct iovec		*iovec;
+	unsigned		nr;
+};
+
 struct io_uring {
 	u32 head;
 	u32 tail;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d6ac41840900..9b05e614819e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1264,3 +1264,12 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	fput(file);
 	return ret;
 }
+
+void io_vec_free(struct iou_vec *iv)
+{
+	if (!iv->iovec)
+		return;
+	kfree(iv->iovec);
+	iv->iovec = NULL;
+	iv->nr = 0;
+}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 662244282b2c..e3f1cfb2ff7b 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -3,6 +3,7 @@
 #define IOU_RSRC_H
 
 #include <linux/lockdep.h>
+#include <linux/io_uring_types.h>
 
 enum {
 	IORING_RSRC_FILE		= 0,
@@ -144,4 +145,20 @@ static inline void __io_unaccount_mem(struct user_struct *user,
 	atomic_long_sub(nr_pages, &user->locked_vm);
 }
 
+void io_vec_free(struct iou_vec *iv);
+
+static inline void io_vec_reset_iovec(struct iou_vec *iv,
+				      struct iovec *iovec, unsigned nr)
+{
+	io_vec_free(iv);
+	iv->iovec = iovec;
+	iv->nr = nr;
+}
+
+static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv)
+{
+	if (IS_ENABLED(CONFIG_KASAN))
+		io_vec_free(iv);
+}
+
 #endif
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 5ee9f8949e8b..ad7f647d48e9 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -87,9 +87,9 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
 	int ret, nr_segs;
 	struct iovec *iov;
 
-	if (io->free_iovec) {
-		nr_segs = io->free_iov_nr;
-		iov = io->free_iovec;
+	if (io->vec.iovec) {
+		nr_segs = io->vec.nr;
+		iov = io->vec.iovec;
 	} else {
 		nr_segs = 1;
 		iov = &io->fast_iov;
@@ -101,9 +101,7 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
 		return ret;
 	if (iov) {
 		req->flags |= REQ_F_NEED_CLEANUP;
-		io->free_iov_nr = io->iter.nr_segs;
-		kfree(io->free_iovec);
-		io->free_iovec = iov;
+		io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs);
 	}
 	return 0;
 }
@@ -151,7 +149,7 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
 		return;
 
-	io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr);
+	io_alloc_cache_vec_kasan(&rw->vec);
 	if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
 		req->async_data = NULL;
 		req->flags &= ~REQ_F_ASYNC_DATA;
@@ -201,7 +199,7 @@ static int io_rw_alloc_async(struct io_kiocb *req)
 	rw = io_uring_alloc_async_data(&ctx->rw_cache, req);
 	if (!rw)
 		return -ENOMEM;
-	if (rw->free_iovec)
+	if (rw->vec.iovec)
 		req->flags |= REQ_F_NEED_CLEANUP;
 	rw->bytes_done = 0;
 	return 0;
@@ -1327,7 +1325,6 @@ void io_rw_cache_free(const void *entry)
 {
 	struct io_async_rw *rw = (struct io_async_rw *) entry;
 
-	if (rw->free_iovec)
-		kfree(rw->free_iovec);
+	io_vec_free(&rw->vec);
 	kfree(rw);
 }
diff --git a/io_uring/rw.h b/io_uring/rw.h
index bf121b81ebe8..e86a3858f48b 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -3,19 +3,21 @@
 #include <linux/io_uring_types.h>
 #include <linux/pagemap.h>
 
+#include "rsrc.h"
+
 struct io_meta_state {
 	u32			seed;
 	struct iov_iter_state	iter_meta;
 };
 
 struct io_async_rw {
+	struct iou_vec			vec;
 	size_t				bytes_done;
-	struct iovec			*free_iovec;
+
 	struct_group(clear,
 		struct iov_iter			iter;
 		struct iov_iter_state		iter_state;
 		struct iovec			fast_iov;
-		int				free_iov_nr;
 		/*
 		 * wpq is for buffered io, while meta fields are used with
 		 * direct io
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/8] io_uring: add infra for importing vectored reg buffers
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
  2025-03-03 15:50 ` [PATCH 1/8] io_uring: introduce struct iou_vec Pavel Begunkov
@ 2025-03-03 15:50 ` Pavel Begunkov
  2025-03-03 20:49   ` Caleb Sander Mateos
  2025-03-03 15:50 ` [PATCH 3/8] io_uring/rw: implement vectored registered rw Pavel Begunkov
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:50 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

Add io_import_reg_vec(), which will be responsible for importing
vectored registered buffers. The iovec array is overlapped in memory
with the resulting bvec array, which is why the iovecs are expected
to be padded out to the tail of the iou_vec allocation.

Signed-off-by: Pavel Begunkov <[email protected]>
---
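To make the in-place overlap concrete, below is a small standalone
userspace model of the scheme (illustrative types: "struct seg" stands
in for struct bio_vec, the folio size is fixed at 4K, and
estimate_segs() mirrors io_estimate_bvec_size() from this patch). The
iovecs are parked at the tail of a single allocation and expanded into
per-folio segments from the front; the overestimate is what keeps the
write index behind every iovec that hasn't been read yet.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/uio.h>

	#define FOLIO_SHIFT	12
	#define FOLIO_SIZE	(1UL << FOLIO_SHIFT)
	#define FOLIO_MASK	(FOLIO_SIZE - 1)

	struct seg {			/* stand-in for struct bio_vec */
		unsigned long	addr;
		size_t		len;
	};

	union slot {
		struct iovec	iov;
		struct seg	seg;
	};

	static size_t estimate_segs(const struct iovec *iov, unsigned nr)
	{
		size_t max_segs = 0;
		unsigned i;

		for (i = 0; i < nr; i++)
			max_segs += (iov[i].iov_len >> FOLIO_SHIFT) + 2;
		return max_segs;
	}

	int main(void)
	{
		struct iovec iovs[2] = {
			{ .iov_base = (void *)0x10f00, .iov_len = 0x300 },
			{ .iov_base = (void *)0x12000, .iov_len = 0x2800 },
		};
		unsigned i, nr_iovs = 2;
		size_t nr_segs = estimate_segs(iovs, nr_iovs);
		union slot *vec = calloc(nr_segs, sizeof(*vec));
		size_t iovec_off = nr_segs - nr_iovs, out = 0;

		if (!vec)
			return 1;
		/* park the iovecs at the tail, like io_import_reg_vec() */
		memcpy(&vec[iovec_off], iovs, sizeof(iovs));

		for (i = 0; i < nr_iovs; i++) {
			struct iovec iov = vec[iovec_off + i].iov;
			unsigned long addr = (unsigned long)iov.iov_base;
			size_t left = iov.iov_len;

			while (left) {	/* split on folio boundaries */
				size_t off = addr & FOLIO_MASK;
				size_t len = left < FOLIO_SIZE - off ?
					     left : FOLIO_SIZE - off;

				vec[out].seg.addr = addr;
				vec[out].seg.len = len;
				out++;
				addr += len;
				left -= len;
			}
		}
		for (i = 0; i < out; i++)
			printf("seg %u: addr=0x%lx len=0x%zx\n",
			       i, vec[i].seg.addr, vec[i].seg.len);
		free(vec);
		return 0;
	}
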
 include/linux/io_uring_types.h |   5 +-
 io_uring/rsrc.c                | 124 +++++++++++++++++++++++++++++++++
 io_uring/rsrc.h                |   5 ++
 3 files changed, 133 insertions(+), 1 deletion(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 9101f12d21ef..b770a2b12da6 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -111,7 +111,10 @@ struct io_uring_task {
 };
 
 struct iou_vec {
-	struct iovec		*iovec;
+	union {
+		struct iovec	*iovec;
+		struct bio_vec	*bvec;
+	};
 	unsigned		nr;
 };
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 9b05e614819e..1ec1f5b3e385 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1267,9 +1267,133 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 
 void io_vec_free(struct iou_vec *iv)
 {
+	BUILD_BUG_ON(sizeof(struct bio_vec) > sizeof(struct iovec));
+
 	if (!iv->iovec)
 		return;
 	kfree(iv->iovec);
 	iv->iovec = NULL;
 	iv->nr = 0;
 }
+
+int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
+{
+	struct iovec *iov;
+
+	WARN_ON_ONCE(nr_entries <= 0);
+
+	iov = kmalloc_array(nr_entries, sizeof(iov[0]), GFP_KERNEL);
+	if (!iov)
+		return -ENOMEM;
+
+	io_vec_free(iv);
+	iv->iovec = iov;
+	iv->nr = nr_entries;
+	return 0;
+}
+
+static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
+				struct io_mapped_ubuf *imu,
+				struct iovec *iovec, int nr_iovs,
+				struct iou_vec *vec)
+{
+	unsigned long folio_size = (1 << imu->folio_shift);
+	unsigned long folio_mask = folio_size - 1;
+	struct bio_vec *res_bvec = vec->bvec;
+	size_t total_len = 0;
+	int bvec_idx = 0;
+	int iov_idx;
+
+	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
+		size_t iov_len = iovec[iov_idx].iov_len;
+		u64 buf_addr = (u64)iovec[iov_idx].iov_base;
+		u64 folio_addr = imu->ubuf & ~folio_mask;
+		struct bio_vec *src_bvec;
+		size_t offset;
+		u64 buf_end;
+
+		if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end)))
+			return -EFAULT;
+		if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
+			return -EFAULT;
+
+		total_len += iov_len;
+		/* by using folio address it also accounts for bvec offset */
+		offset = buf_addr - folio_addr;
+		src_bvec = imu->bvec + (offset >> imu->folio_shift);
+		offset &= folio_mask;
+
+		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
+			size_t seg_size = min_t(size_t, iov_len,
+						folio_size - offset);
+
+			res_bvec[bvec_idx].bv_page = src_bvec->bv_page;
+			res_bvec[bvec_idx].bv_offset = offset;
+			res_bvec[bvec_idx].bv_len = seg_size;
+			iov_len -= seg_size;
+		}
+	}
+	if (total_len > MAX_RW_COUNT)
+		return -EINVAL;
+
+	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
+	return 0;
+}
+
+static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
+				 struct io_mapped_ubuf *imu)
+{
+	unsigned shift = imu->folio_shift;
+	size_t max_segs = 0;
+	unsigned i;
+
+	for (i = 0; i < nr_iovs; i++)
+		max_segs += (iov[i].iov_len >> shift) + 2;
+	return max_segs;
+}
+
+int io_import_reg_vec(int ddir, struct iov_iter *iter,
+			struct io_kiocb *req, struct iou_vec *vec,
+			int nr_iovs, unsigned iovec_off,
+			unsigned issue_flags)
+{
+	struct io_rsrc_node *node;
+	struct io_mapped_ubuf *imu;
+	unsigned cache_nr;
+	struct iovec *iov;
+	unsigned nr_segs;
+	int ret;
+
+	node = io_find_buf_node(req, issue_flags);
+	if (!node)
+		return -EFAULT;
+	imu = node->buf;
+	if (imu->is_kbuf)
+		return -EOPNOTSUPP;
+
+	iov = vec->iovec + iovec_off;
+	ret = io_estimate_bvec_size(iov, nr_iovs, imu);
+	if (ret < 0)
+		return ret;
+	nr_segs = ret;
+	cache_nr = vec->nr;
+
+	if (WARN_ON_ONCE(iovec_off + nr_iovs != cache_nr) ||
+	    nr_segs > cache_nr) {
+		struct iou_vec tmp_vec = {};
+
+		ret = io_vec_realloc(&tmp_vec, nr_segs);
+		if (ret)
+			return ret;
+
+		iovec_off = tmp_vec.nr - nr_iovs;
+		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
+		io_vec_free(vec);
+
+		*vec = tmp_vec;
+		iov = vec->iovec + iovec_off;
+		req->flags |= REQ_F_NEED_CLEANUP;
+	}
+
+	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
+}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index e3f1cfb2ff7b..769ef5d76a4b 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -61,6 +61,10 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
 int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
 			u64 buf_addr, size_t len, int ddir,
 			unsigned issue_flags);
+int io_import_reg_vec(int ddir, struct iov_iter *iter,
+			struct io_kiocb *req, struct iou_vec *vec,
+			int nr_iovs, unsigned iovec_off,
+			unsigned issue_flags);
 
 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
@@ -146,6 +150,7 @@ static inline void __io_unaccount_mem(struct user_struct *user,
 }
 
 void io_vec_free(struct iou_vec *iv);
+int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries);
 
 static inline void io_vec_reset_iovec(struct iou_vec *iv,
 				      struct iovec *iovec, unsigned nr)
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 3/8] io_uring/rw: implement vectored registered rw
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
  2025-03-03 15:50 ` [PATCH 1/8] io_uring: introduce struct iou_vec Pavel Begunkov
  2025-03-03 15:50 ` [PATCH 2/8] io_uring: add infra for importing vectored reg buffers Pavel Begunkov
@ 2025-03-03 15:50 ` Pavel Begunkov
  2025-03-03 23:01   ` Caleb Sander Mateos
  2025-03-03 15:50 ` [PATCH 4/8] io_uring/rw: defer reg buf vec import Pavel Begunkov
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:50 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

Implement registered buffer vectored reads and writes with the new
opcodes IORING_OP_READV_FIXED and IORING_OP_WRITEV_FIXED.

Signed-off-by: Pavel Begunkov <[email protected]>
---
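For userspace reference, the new opcodes keep the classic readv/writev
SQE layout with one addition; a sketch of the field contract (fd, iov,
nr_iovs, offset and buf_idx are caller-supplied):

	sqe->opcode	= IORING_OP_WRITEV_FIXED;
	sqe->fd		= fd;
	sqe->addr	= (unsigned long)iov;	/* struct iovec array */
	sqe->len	= nr_iovs;		/* entry count, not bytes */
	sqe->off	= offset;
	sqe->buf_index	= buf_idx;	/* registered buffer to import from */
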
 include/uapi/linux/io_uring.h |  2 ++
 io_uring/opdef.c              | 39 +++++++++++++++++++++++++++
 io_uring/rw.c                 | 51 +++++++++++++++++++++++++++++++++++
 io_uring/rw.h                 |  2 ++
 4 files changed, 94 insertions(+)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1e02e94bc26d..9dd384b369ee 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -280,6 +280,8 @@ enum io_uring_op {
 	IORING_OP_BIND,
 	IORING_OP_LISTEN,
 	IORING_OP_RECV_ZC,
+	IORING_OP_READV_FIXED,
+	IORING_OP_WRITEV_FIXED,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9511262c513e..6655d2cbf74d 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -529,6 +529,35 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
+	[IORING_OP_READV_FIXED] = {
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollin			= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.vectored		= 1,
+		.async_size		= sizeof(struct io_async_rw),
+		.prep			= io_prep_readv_fixed,
+		.issue			= io_read,
+	},
+	[IORING_OP_WRITEV_FIXED] = {
+		.needs_file		= 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollout		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.vectored		= 1,
+		.async_size		= sizeof(struct io_async_rw),
+		.prep			= io_prep_writev_fixed,
+		.issue			= io_write,
+	},
 };
 
 const struct io_cold_def io_cold_defs[] = {
@@ -761,6 +790,16 @@ const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_RECV_ZC] = {
 		.name			= "RECV_ZC",
 	},
+	[IORING_OP_READV_FIXED] = {
+		.name			= "READV_FIXED",
+		.cleanup		= io_readv_writev_cleanup,
+		.fail			= io_rw_fail,
+	},
+	[IORING_OP_WRITEV_FIXED] = {
+		.name			= "WRITEV_FIXED",
+		.cleanup		= io_readv_writev_cleanup,
+		.fail			= io_rw_fail,
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index ad7f647d48e9..4c4229f41aaa 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -381,6 +381,57 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return __io_prep_rw(req, sqe, ITER_SOURCE);
 }
 
+static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct io_async_rw *io = req->async_data;
+	const struct iovec __user *uvec;
+	size_t uvec_segs = rw->len;
+	struct iovec *iov;
+	int iovec_off, ret;
+	void *res;
+
+	if (uvec_segs > io->vec.nr) {
+		ret = io_vec_realloc(&io->vec, uvec_segs);
+		if (ret)
+			return ret;
+		req->flags |= REQ_F_NEED_CLEANUP;
+	}
+	/* pad iovec to the right */
+	iovec_off = io->vec.nr - uvec_segs;
+	iov = io->vec.iovec + iovec_off;
+	uvec = u64_to_user_ptr(rw->addr);
+	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
+			      io_is_compat(req->ctx));
+	if (IS_ERR(res))
+		return PTR_ERR(res);
+
+	ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
+				uvec_segs, iovec_off, 0);
+	iov_iter_save_state(&io->iter, &io->iter_state);
+	return ret;
+}
+
+int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	int ret;
+
+	ret = __io_prep_rw(req, sqe, ITER_DEST);
+	if (unlikely(ret))
+		return ret;
+	return io_rw_prep_reg_vec(req, ITER_DEST);
+}
+
+int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	int ret;
+
+	ret = __io_prep_rw(req, sqe, ITER_SOURCE);
+	if (unlikely(ret))
+		return ret;
+	return io_rw_prep_reg_vec(req, ITER_SOURCE);
+}
+
 /*
  * Multishot read is prepared just like a normal read/write request, only
  * difference is that we set the MULTISHOT flag.
diff --git a/io_uring/rw.h b/io_uring/rw.h
index e86a3858f48b..475b6306a316 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -34,6 +34,8 @@ struct io_async_rw {
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 4/8] io_uring/rw: defer reg buf vec import
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
                   ` (2 preceding siblings ...)
  2025-03-03 15:50 ` [PATCH 3/8] io_uring/rw: implement vectored registered rw Pavel Begunkov
@ 2025-03-03 15:50 ` Pavel Begunkov
  2025-03-03 23:37   ` Caleb Sander Mateos
  2025-03-03 15:51 ` [PATCH 5/8] io_uring/net: combine msghdr copy Pavel Begunkov
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:50 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

Import registered buffers for vectored reads and writes later, at
issue time, as we now do for other fixed ops.

Signed-off-by: Pavel Begunkov <[email protected]>
---
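The resulting flow, as a sketch (function names from this series):

	io_prep_readv_fixed() / io_prep_writev_fixed()
	  -> io_rw_prep_reg_vec(): copy user iovecs to the tail of
	     io->vec and set REQ_F_IMPORT_BUFFER, no import yet
	io_read() / io_write()
	  -> io_rw_import_reg_vec(): resolve the iovecs against the
	     registered buffer into bvecs and clear the flag
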
 include/linux/io_uring_types.h |  3 +++
 io_uring/rw.c                  | 36 +++++++++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index b770a2b12da6..d36fccda754b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -502,6 +502,7 @@ enum {
 	REQ_F_BUFFERS_COMMIT_BIT,
 	REQ_F_BUF_NODE_BIT,
 	REQ_F_HAS_METADATA_BIT,
+	REQ_F_IMPORT_BUFFER_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -584,6 +585,8 @@ enum {
 	REQ_F_BUF_NODE		= IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
 	/* request has read/write metadata assigned */
 	REQ_F_HAS_METADATA	= IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
+	/* resolve padded iovec to registered buffers */
+	REQ_F_IMPORT_BUFFER	= IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c4229f41aaa..33a7ab2a8664 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -381,6 +381,24 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return __io_prep_rw(req, sqe, ITER_SOURCE);
 }
 
+static int io_rw_import_reg_vec(struct io_kiocb *req,
+				struct io_async_rw *io,
+				int ddir, unsigned int issue_flags)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	unsigned uvec_segs = rw->len;
+	unsigned iovec_off = io->vec.nr - uvec_segs;
+	int ret;
+
+	ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
+				uvec_segs, iovec_off, issue_flags);
+	if (unlikely(ret))
+		return ret;
+	iov_iter_save_state(&io->iter, &io->iter_state);
+	req->flags &= ~REQ_F_IMPORT_BUFFER;
+	return 0;
+}
+
 static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -406,10 +424,8 @@ static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
 	if (IS_ERR(res))
 		return PTR_ERR(res);
 
-	ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
-				uvec_segs, iovec_off, 0);
-	iov_iter_save_state(&io->iter, &io->iter_state);
-	return ret;
+	req->flags |= REQ_F_IMPORT_BUFFER;
+	return 0;
 }
 
 int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -906,7 +922,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	ssize_t ret;
 	loff_t *ppos;
 
-	if (io_do_buffer_select(req)) {
+	if (req->flags & REQ_F_IMPORT_BUFFER) {
+		ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags);
+		if (unlikely(ret))
+			return ret;
+	} else if (io_do_buffer_select(req)) {
 		ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags);
 		if (unlikely(ret < 0))
 			return ret;
@@ -1117,6 +1137,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	ssize_t ret, ret2;
 	loff_t *ppos;
 
+	if (req->flags & REQ_F_IMPORT_BUFFER) {
+		ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags);
+		if (unlikely(ret))
+			return ret;
+	}
+
 	ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
 	if (unlikely(ret))
 		return ret;
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 5/8] io_uring/net: combine msghdr copy
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
                   ` (3 preceding siblings ...)
  2025-03-03 15:50 ` [PATCH 4/8] io_uring/rw: defer reg buf vec import Pavel Begunkov
@ 2025-03-03 15:51 ` Pavel Begunkov
  2025-03-03 15:51 ` [PATCH 6/8] io_uring/net: pull vec alloc out of msghdr import Pavel Begunkov
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:51 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

Call the compat version from inside of io_msg_copy_hdr() and don't
duplicate it in callers.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 io_uring/net.c | 46 +++++++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 905d1ee01201..33076bd22c16 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -287,6 +287,24 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
 	struct user_msghdr __user *umsg = sr->umsg;
 	int ret;
 
+	iomsg->msg.msg_name = &iomsg->addr;
+	iomsg->msg.msg_iter.nr_segs = 0;
+
+	if (io_is_compat(req->ctx)) {
+		struct compat_msghdr cmsg;
+
+		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
+		if (ret)
+			return ret;
+
+	memset(msg, 0, sizeof(*msg));
+		msg->msg_namelen = cmsg.msg_namelen;
+		msg->msg_controllen = cmsg.msg_controllen;
+		msg->msg_iov = compat_ptr(cmsg.msg_iov);
+		msg->msg_iovlen = cmsg.msg_iovlen;
+		return 0;
+	}
+
 	ret = io_copy_msghdr_from_user(msg, umsg);
 	if (unlikely(ret))
 		return ret;
@@ -323,18 +341,6 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 	struct user_msghdr msg;
 	int ret;
 
-	iomsg->msg.msg_name = &iomsg->addr;
-	iomsg->msg.msg_iter.nr_segs = 0;
-
-	if (io_is_compat(req->ctx)) {
-		struct compat_msghdr cmsg;
-
-		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE,
-					     NULL);
-		sr->msg_control = iomsg->msg.msg_control_user;
-		return ret;
-	}
-
 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL);
 	/* save msg_control as sys_sendmsg() overwrites it */
 	sr->msg_control = iomsg->msg.msg_control_user;
@@ -710,21 +716,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 	struct user_msghdr msg;
 	int ret;
 
-	iomsg->msg.msg_name = &iomsg->addr;
-	iomsg->msg.msg_iter.nr_segs = 0;
-
-	if (io_is_compat(req->ctx)) {
-		struct compat_msghdr cmsg;
-
-		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST,
-					     &iomsg->uaddr);
-		memset(&msg, 0, sizeof(msg));
-		msg.msg_namelen = cmsg.msg_namelen;
-		msg.msg_controllen = cmsg.msg_controllen;
-	} else {
-		ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
-	}
-
+	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
 	if (unlikely(ret))
 		return ret;
 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 6/8] io_uring/net: pull vec alloc out of msghdr import
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
                   ` (4 preceding siblings ...)
  2025-03-03 15:51 ` [PATCH 5/8] io_uring/net: combine msghdr copy Pavel Begunkov
@ 2025-03-03 15:51 ` Pavel Begunkov
  2025-03-03 15:51 ` [PATCH 7/8] io_uring/net: convert to struct iou_vec Pavel Begunkov
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:51 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

I'll need more control over iovec management, so move
io_net_import_vec() out of io_msg_copy_hdr().

Signed-off-by: Pavel Begunkov <[email protected]>
---
 io_uring/net.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 33076bd22c16..cbb889b85cfc 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -253,12 +253,8 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
 				return -EFAULT;
 			sr->len = tmp_iov.iov_len;
 		}
-
-		return 0;
 	}
-
-	return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov,
-				 msg->msg_iovlen, ddir);
+	return 0;
 }
 
 static int io_copy_msghdr_from_user(struct user_msghdr *msg,
@@ -328,10 +324,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
 				return -EFAULT;
 			sr->len = tmp_iov.iov_len;
 		}
-		return 0;
 	}
-
-	return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir);
+	return 0;
 }
 
 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
@@ -342,6 +336,12 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 	int ret;
 
 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL);
+	if (unlikely(ret))
+		return ret;
+
+	if (!(req->flags & REQ_F_BUFFER_SELECT))
+		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
+					ITER_SOURCE);
 	/* save msg_control as sys_sendmsg() overwrites it */
 	sr->msg_control = iomsg->msg.msg_control_user;
 	return ret;
@@ -719,6 +719,13 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
 	if (unlikely(ret))
 		return ret;
+
+	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
+		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
+					ITER_DEST);
+		if (unlikely(ret))
+			return ret;
+	}
 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
 					msg.msg_controllen);
 }
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 7/8] io_uring/net: convert to struct iou_vec
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
                   ` (5 preceding siblings ...)
  2025-03-03 15:51 ` [PATCH 6/8] io_uring/net: pull vec alloc out of msghdr import Pavel Begunkov
@ 2025-03-03 15:51 ` Pavel Begunkov
  2025-03-03 23:37   ` Caleb Sander Mateos
  2025-03-03 15:51 ` [PATCH 8/8] io_uring/net: implement vectored reg bufs for zctx Pavel Begunkov
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:51 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

Convert net.c to use struct iou_vec.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 io_uring/alloc_cache.h |  9 --------
 io_uring/net.c         | 51 ++++++++++++++++++------------------------
 io_uring/net.h         |  6 ++---
 3 files changed, 25 insertions(+), 41 deletions(-)

diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 0dd17d8ba93a..7094d9d0bd29 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache,
 
 void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp);
 
-static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr)
-{
-	if (IS_ENABLED(CONFIG_KASAN)) {
-		kfree(*iov);
-		*iov = NULL;
-		*nr = 0;
-	}
-}
-
 static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
 				      void *entry)
 {
diff --git a/io_uring/net.c b/io_uring/net.c
index cbb889b85cfc..a4b39343f345 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -136,11 +136,8 @@ static bool io_net_retry(struct socket *sock, int flags)
 
 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
 {
-	if (kmsg->free_iov) {
-		kfree(kmsg->free_iov);
-		kmsg->free_iov_nr = 0;
-		kmsg->free_iov = NULL;
-	}
+	if (kmsg->vec.iovec)
+		io_vec_free(&kmsg->vec);
 }
 
 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
@@ -154,7 +151,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 	/* Let normal cleanup path reap it if we fail adding to the cache */
-	io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr);
+	io_alloc_cache_vec_kasan(&hdr->vec);
 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
 		req->async_data = NULL;
 		req->flags &= ~REQ_F_ASYNC_DATA;
@@ -171,7 +168,7 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
 		return NULL;
 
 	/* If the async data was cached, we might have an iov cached inside. */
-	if (hdr->free_iov)
+	if (hdr->vec.iovec)
 		req->flags |= REQ_F_NEED_CLEANUP;
 	return hdr;
 }
@@ -182,10 +179,7 @@ static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg
 {
 	if (iov) {
 		req->flags |= REQ_F_NEED_CLEANUP;
-		kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
-		if (kmsg->free_iov)
-			kfree(kmsg->free_iov);
-		kmsg->free_iov = iov;
+		io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs);
 	}
 }
 
@@ -208,9 +202,9 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg
 	struct iovec *iov;
 	int ret, nr_segs;
 
-	if (iomsg->free_iov) {
-		nr_segs = iomsg->free_iov_nr;
-		iov = iomsg->free_iov;
+	if (iomsg->vec.iovec) {
+		nr_segs = iomsg->vec.nr;
+		iov = iomsg->vec.iovec;
 	} else {
 		nr_segs = 1;
 		iov = &iomsg->fast_iov;
@@ -468,7 +462,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
 	if (iter_is_ubuf(&kmsg->msg.msg_iter))
 		return 1;
 
-	iov = kmsg->free_iov;
+	iov = kmsg->vec.iovec;
 	if (!iov)
 		iov = &kmsg->fast_iov;
 
@@ -584,9 +578,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
 		.nr_iovs = 1,
 	};
 
-	if (kmsg->free_iov) {
-		arg.nr_iovs = kmsg->free_iov_nr;
-		arg.iovs = kmsg->free_iov;
+	if (kmsg->vec.iovec) {
+		arg.nr_iovs = kmsg->vec.nr;
+		arg.iovs = kmsg->vec.iovec;
 		arg.mode = KBUF_MODE_FREE;
 	}
 
@@ -599,9 +593,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
 	if (unlikely(ret < 0))
 		return ret;
 
-	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
-		kmsg->free_iov_nr = ret;
-		kmsg->free_iov = arg.iovs;
+	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+		kmsg->vec.nr = ret;
+		kmsg->vec.iovec = arg.iovs;
 		req->flags |= REQ_F_NEED_CLEANUP;
 	}
 	sr->len = arg.out_len;
@@ -1085,9 +1079,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
 			.mode = KBUF_MODE_EXPAND,
 		};
 
-		if (kmsg->free_iov) {
-			arg.nr_iovs = kmsg->free_iov_nr;
-			arg.iovs = kmsg->free_iov;
+		if (kmsg->vec.iovec) {
+			arg.nr_iovs = kmsg->vec.nr;
+			arg.iovs = kmsg->vec.iovec;
 			arg.mode |= KBUF_MODE_FREE;
 		}
 
@@ -1106,9 +1100,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
 		}
 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
 				arg.out_len);
-		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
-			kmsg->free_iov_nr = ret;
-			kmsg->free_iov = arg.iovs;
+		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+			kmsg->vec.nr = ret;
+			kmsg->vec.iovec = arg.iovs;
 			req->flags |= REQ_F_NEED_CLEANUP;
 		}
 	} else {
@@ -1874,8 +1868,7 @@ void io_netmsg_cache_free(const void *entry)
 {
 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
 
-	if (kmsg->free_iov)
-		io_netmsg_iovec_free(kmsg);
+	io_vec_free(&kmsg->vec);
 	kfree(kmsg);
 }
 #endif
diff --git a/io_uring/net.h b/io_uring/net.h
index b804c2b36e60..43e5ce5416b7 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -2,12 +2,12 @@
 
 #include <linux/net.h>
 #include <linux/uio.h>
+#include <linux/io_uring_types.h>
 
 struct io_async_msghdr {
 #if defined(CONFIG_NET)
-	struct iovec			*free_iov;
-	/* points to an allocated iov, if NULL we use fast_iov instead */
-	int				free_iov_nr;
+	struct iou_vec				vec;
+
 	struct_group(clear,
 		int				namelen;
 		struct iovec			fast_iov;
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 8/8] io_uring/net: implement vectored reg bufs for zctx
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
                   ` (6 preceding siblings ...)
  2025-03-03 15:51 ` [PATCH 7/8] io_uring/net: convert to struct iou_vec Pavel Begunkov
@ 2025-03-03 15:51 ` Pavel Begunkov
  2025-03-03 21:03 ` [PATCH 0/8] Add support for vectored registered buffers Andres Freund
  2025-03-04  0:34 ` Caleb Sander Mateos
  9 siblings, 0 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-03 15:51 UTC (permalink / raw)
  To: io-uring; +Cc: asml.silence, Andres Freund

Add support for vectored registered buffers for send zc.

Signed-off-by: Pavel Begunkov <[email protected]>
---
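For reference, a minimal liburing sketch of the userspace side
(illustrative only: error handling is omitted, "ring" is an
initialized ring, sockfd a connected socket, and every iov entry
points into the buffer registered at index 0; note the
IORING_RECVSEND_* flags travel in sqe->ioprio):

	struct msghdr mh = {
		.msg_iov	= iov,
		.msg_iovlen	= nr_iovs,
	};
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_sendmsg_zc(sqe, sockfd, &mh, 0);
	sqe->ioprio	|= IORING_RECVSEND_FIXED_BUF;
	sqe->buf_index	 = 0;
	io_uring_submit(&ring);
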
 io_uring/net.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 4 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index a4b39343f345..5e27c22e1d58 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -395,6 +395,44 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	return io_sendmsg_copy_hdr(req, kmsg);
 }
 
+static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+	struct io_async_msghdr *kmsg = req->async_data;
+	struct user_msghdr msg;
+	int ret, iovec_off;
+	struct iovec *iov;
+	void *res;
+
+	if (!(sr->flags & IORING_RECVSEND_FIXED_BUF))
+		return io_sendmsg_setup(req, sqe);
+
+	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
+	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
+	if (unlikely(ret))
+		return ret;
+	sr->msg_control = kmsg->msg.msg_control_user;
+
+	if (msg.msg_iovlen > kmsg->vec.nr || WARN_ON_ONCE(!kmsg->vec.iovec)) {
+		ret = io_vec_realloc(&kmsg->vec, msg.msg_iovlen);
+		if (ret)
+			return ret;
+		req->flags |= REQ_F_NEED_CLEANUP;
+	}
+	iovec_off = kmsg->vec.nr - msg.msg_iovlen;
+	iov = kmsg->vec.iovec + iovec_off;
+
+	res = iovec_from_user(msg.msg_iov, msg.msg_iovlen, kmsg->vec.nr, iov,
+			      io_is_compat(req->ctx));
+	if (IS_ERR(res))
+		return PTR_ERR(res);
+
+	kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
+	req->flags |= REQ_F_IMPORT_BUFFER;
+	return ret;
+}
+
 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
 
 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1333,8 +1371,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->opcode != IORING_OP_SEND_ZC) {
 		if (unlikely(sqe->addr2 || sqe->file_index))
 			return -EINVAL;
-		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
-			return -EINVAL;
 	}
 
 	zc->len = READ_ONCE(sqe->len);
@@ -1350,7 +1386,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -ENOMEM;
 	if (req->opcode != IORING_OP_SENDMSG_ZC)
 		return io_send_setup(req, sqe);
-	return io_sendmsg_setup(req, sqe);
+	return io_sendmsg_zc_setup(req, sqe);
 }
 
 static int io_sg_from_iter_iovec(struct sk_buff *skb,
@@ -1506,6 +1542,22 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
 	unsigned flags;
 	int ret, min_ret = 0;
 
+	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
+
+	if (req->flags & REQ_F_IMPORT_BUFFER) {
+		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
+		unsigned iovec_off = kmsg->vec.nr - uvec_segs;
+		int ret;
+
+		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
+					&kmsg->vec, uvec_segs, iovec_off,
+					issue_flags);
+		if (unlikely(ret))
+			return ret;
+		kmsg->msg.sg_from_iter = io_sg_from_iter;
+		req->flags &= ~REQ_F_IMPORT_BUFFER;
+	}
+
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
 		return -ENOTSOCK;
@@ -1524,7 +1576,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
 
 	kmsg->msg.msg_control_user = sr->msg_control;
 	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
-	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
 
 	if (unlikely(ret < min_ret)) {
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/8] io_uring: introduce struct iou_vec
  2025-03-03 15:50 ` [PATCH 1/8] io_uring: introduce struct iou_vec Pavel Begunkov
@ 2025-03-03 18:23   ` Caleb Sander Mateos
  0 siblings, 0 replies; 25+ messages in thread
From: Caleb Sander Mateos @ 2025-03-03 18:23 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: io-uring, Andres Freund

On Mon, Mar 3, 2025 at 7:50 AM Pavel Begunkov <[email protected]> wrote:
>
> I need a convenient way to pass around and work with an iovec+size
> pair. Put them into a structure and make use of it in rw.c.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
>  include/linux/io_uring_types.h |  5 +++++
>  io_uring/rsrc.c                |  9 +++++++++
>  io_uring/rsrc.h                | 17 +++++++++++++++++
>  io_uring/rw.c                  | 17 +++++++----------
>  io_uring/rw.h                  |  6 ++++--
>  5 files changed, 42 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 35fc241c4672..9101f12d21ef 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -110,6 +110,11 @@ struct io_uring_task {
>         } ____cacheline_aligned_in_smp;
>  };
>
> +struct iou_vec {
> +       struct iovec            *iovec;
> +       unsigned                nr;
> +};
> +
>  struct io_uring {
>         u32 head;
>         u32 tail;
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index d6ac41840900..9b05e614819e 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -1264,3 +1264,12 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
>         fput(file);
>         return ret;
>  }
> +
> +void io_vec_free(struct iou_vec *iv)
> +{
> +       if (!iv->iovec)
> +               return;
> +       kfree(iv->iovec);
> +       iv->iovec = NULL;
> +       iv->nr = 0;
> +}
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 662244282b2c..e3f1cfb2ff7b 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -3,6 +3,7 @@
>  #define IOU_RSRC_H
>
>  #include <linux/lockdep.h>
> +#include <linux/io_uring_types.h>

I sent out a separate patch for this a couple days ago:
https://lore.kernel.org/io-uring/[email protected]/T/#u

>
>  enum {
>         IORING_RSRC_FILE                = 0,
> @@ -144,4 +145,20 @@ static inline void __io_unaccount_mem(struct user_struct *user,
>         atomic_long_sub(nr_pages, &user->locked_vm);
>  }
>
> +void io_vec_free(struct iou_vec *iv);
> +
> +static inline void io_vec_reset_iovec(struct iou_vec *iv,
> +                                     struct iovec *iovec, unsigned nr)
> +{
> +       io_vec_free(iv);
> +       iv->iovec = iovec;
> +       iv->nr = nr;
> +}
> +
> +static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv)
> +{
> +       if (IS_ENABLED(CONFIG_KASAN))
> +               io_vec_free(iv);
> +}
> +
>  #endif
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 5ee9f8949e8b..ad7f647d48e9 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -87,9 +87,9 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
>         int ret, nr_segs;
>         struct iovec *iov;
>
> -       if (io->free_iovec) {
> -               nr_segs = io->free_iov_nr;
> -               iov = io->free_iovec;
> +       if (io->vec.iovec) {
> +               nr_segs = io->vec.nr;
> +               iov = io->vec.iovec;
>         } else {
>                 nr_segs = 1;
>                 iov = &io->fast_iov;
> @@ -101,9 +101,7 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
>                 return ret;
>         if (iov) {
>                 req->flags |= REQ_F_NEED_CLEANUP;
> -               io->free_iov_nr = io->iter.nr_segs;
> -               kfree(io->free_iovec);
> -               io->free_iovec = iov;
> +               io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs);
>         }
>         return 0;
>  }
> @@ -151,7 +149,7 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
>         if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
>                 return;
>
> -       io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr);
> +       io_alloc_cache_vec_kasan(&rw->vec);
>         if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
>                 req->async_data = NULL;
>                 req->flags &= ~REQ_F_ASYNC_DATA;
> @@ -201,7 +199,7 @@ static int io_rw_alloc_async(struct io_kiocb *req)
>         rw = io_uring_alloc_async_data(&ctx->rw_cache, req);
>         if (!rw)
>                 return -ENOMEM;
> -       if (rw->free_iovec)
> +       if (rw->vec.iovec)
>                 req->flags |= REQ_F_NEED_CLEANUP;
>         rw->bytes_done = 0;
>         return 0;
> @@ -1327,7 +1325,6 @@ void io_rw_cache_free(const void *entry)
>  {
>         struct io_async_rw *rw = (struct io_async_rw *) entry;
>
> -       if (rw->free_iovec)
> -               kfree(rw->free_iovec);
> +       io_vec_free(&rw->vec);
>         kfree(rw);
>  }
> diff --git a/io_uring/rw.h b/io_uring/rw.h
> index bf121b81ebe8..e86a3858f48b 100644
> --- a/io_uring/rw.h
> +++ b/io_uring/rw.h
> @@ -3,19 +3,21 @@
>  #include <linux/io_uring_types.h>
>  #include <linux/pagemap.h>
>
> +#include "rsrc.h"

Why is this include necessary? struct iou_vec is defined in
io_uring_types.h. Seems like the include would make more sense in
rw.c.

Best,
Caleb

> +
>  struct io_meta_state {
>         u32                     seed;
>         struct iov_iter_state   iter_meta;
>  };
>
>  struct io_async_rw {
> +       struct iou_vec                  vec;
>         size_t                          bytes_done;
> -       struct iovec                    *free_iovec;
> +
>         struct_group(clear,
>                 struct iov_iter                 iter;
>                 struct iov_iter_state           iter_state;
>                 struct iovec                    fast_iov;
> -               int                             free_iov_nr;
>                 /*
>                  * wpq is for buffered io, while meta fields are used with
>                  * direct io
> --
> 2.48.1
>
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/8] io_uring: add infra for importing vectored reg buffers
  2025-03-03 15:50 ` [PATCH 2/8] io_uring: add infra for importing vectored reg buffers Pavel Begunkov
@ 2025-03-03 20:49   ` Caleb Sander Mateos
  2025-03-03 20:57     ` Keith Busch
  2025-03-04 10:05     ` Pavel Begunkov
  0 siblings, 2 replies; 25+ messages in thread
From: Caleb Sander Mateos @ 2025-03-03 20:49 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: io-uring, Andres Freund

On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <[email protected]> wrote:
>
> Add io_import_reg_vec(), which will be responsible for importing
> vectored registered buffers. The iovec array is overlapped in memory
> with the resulting bvec array, which is why the iovecs are expected
> to be padded out to the tail of the iou_vec allocation.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
>  include/linux/io_uring_types.h |   5 +-
>  io_uring/rsrc.c                | 124 +++++++++++++++++++++++++++++++++
>  io_uring/rsrc.h                |   5 ++
>  3 files changed, 133 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 9101f12d21ef..b770a2b12da6 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -111,7 +111,10 @@ struct io_uring_task {
>  };
>
>  struct iou_vec {
> -       struct iovec            *iovec;
> +       union {
> +               struct iovec    *iovec;
> +               struct bio_vec  *bvec;
> +       };

If I understand correctly, io_import_reg_vec() converts the iovecs to
bio_vecs in place. If an iovec expands to more than one bio_vec (i.e.
crosses a folio boundary), wouldn't the bio_vecs overwrite iovecs that
hadn't been processed yet?

>         unsigned                nr;
>  };
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 9b05e614819e..1ec1f5b3e385 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -1267,9 +1267,133 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
>
>  void io_vec_free(struct iou_vec *iv)
>  {
> +       BUILD_BUG_ON(sizeof(struct bio_vec) > sizeof(struct iovec));
> +
>         if (!iv->iovec)
>                 return;
>         kfree(iv->iovec);
>         iv->iovec = NULL;
>         iv->nr = 0;
>  }
> +
> +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
> +{
> +       struct iovec *iov;
> +
> +       WARN_ON_ONCE(nr_entries <= 0);
> +
> +       iov = kmalloc_array(nr_entries, sizeof(iov[0]), GFP_KERNEL);
> +       if (!iov)
> +               return -ENOMEM;
> +
> +       io_vec_free(iv);
> +       iv->iovec = iov;
> +       iv->nr = nr_entries;
> +       return 0;
> +}
> +
> +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
> +                               struct io_mapped_ubuf *imu,
> +                               struct iovec *iovec, int nr_iovs,
> +                               struct iou_vec *vec)
> +{
> +       unsigned long folio_size = (1 << imu->folio_shift);
> +       unsigned long folio_mask = folio_size - 1;
> +       struct bio_vec *res_bvec = vec->bvec;
> +       size_t total_len = 0;
> +       int bvec_idx = 0;
> +       int iov_idx;
> +
> +       for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
> +               size_t iov_len = iovec[iov_idx].iov_len;
> +               u64 buf_addr = (u64)iovec[iov_idx].iov_base;
> +               u64 folio_addr = imu->ubuf & ~folio_mask;

The computation of folio_addr could be moved out of the loop.

> +               struct bio_vec *src_bvec;
> +               size_t offset;
> +               u64 buf_end;
> +
> +               if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end)))
> +                       return -EFAULT;
> +               if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
> +                       return -EFAULT;
> +
> +               total_len += iov_len;
> +               /* by using folio address it also accounts for bvec offset */
> +               offset = buf_addr - folio_addr;
> +               src_bvec = imu->bvec + (offset >> imu->folio_shift);
> +               offset &= folio_mask;
> +
> +               for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
> +                       size_t seg_size = min_t(size_t, iov_len,
> +                                               folio_size - offset);
> +
> +                       res_bvec[bvec_idx].bv_page = src_bvec->bv_page;
> +                       res_bvec[bvec_idx].bv_offset = offset;
> +                       res_bvec[bvec_idx].bv_len = seg_size;

Could just increment res_bvec to avoid the variable bvec_idx?

> +                       iov_len -= seg_size;
> +               }
> +       }
> +       if (total_len > MAX_RW_COUNT)
> +               return -EINVAL;
> +
> +       iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
> +       return 0;
> +}
> +
> +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
> +                                struct io_mapped_ubuf *imu)
> +{
> +       unsigned shift = imu->folio_shift;
> +       size_t max_segs = 0;
> +       unsigned i;
> +
> +       for (i = 0; i < nr_iovs; i++)
> +               max_segs += (iov[i].iov_len >> shift) + 2;

Seems like this may overestimate a bit. I think something like this
would give the exact number of segments for each iovec?
(((u64)iov_base & folio_mask) + iov_len + folio_mask) >> folio_shift
For example, a 5000-byte iovec starting 3000 bytes into a 4K folio
touches (3000 + 5000 + 4095) >> 12 = 2 folios, while the estimate
above gives (5000 >> 12) + 2 = 3.

> +       return max_segs;
> +}
> +
> +int io_import_reg_vec(int ddir, struct iov_iter *iter,
> +                       struct io_kiocb *req, struct iou_vec *vec,
> +                       int nr_iovs, unsigned iovec_off,
> +                       unsigned issue_flags)
> +{
> +       struct io_rsrc_node *node;
> +       struct io_mapped_ubuf *imu;
> +       unsigned cache_nr;
> +       struct iovec *iov;
> +       unsigned nr_segs;
> +       int ret;
> +
> +       node = io_find_buf_node(req, issue_flags);
> +       if (!node)
> +               return -EFAULT;
> +       imu = node->buf;
> +       if (imu->is_kbuf)
> +               return -EOPNOTSUPP;
> +
> +       iov = vec->iovec + iovec_off;
> +       ret = io_estimate_bvec_size(iov, nr_iovs, imu);
> +       if (ret < 0)
> +               return ret;

io_estimate_bvec_size() doesn't (intentionally) return an error code,
just an unsigned value cast to an int.

Best,
Caleb

> +       nr_segs = ret;
> +       cache_nr = vec->nr;
> +
> +       if (WARN_ON_ONCE(iovec_off + nr_iovs != cache_nr) ||
> +           nr_segs > cache_nr) {
> +               struct iou_vec tmp_vec = {};
> +
> +               ret = io_vec_realloc(&tmp_vec, nr_segs);
> +               if (ret)
> +                       return ret;
> +
> +               iovec_off = tmp_vec.nr - nr_iovs;
> +               memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
> +               io_vec_free(vec);
> +
> +               *vec = tmp_vec;
> +               iov = vec->iovec + iovec_off;
> +               req->flags |= REQ_F_NEED_CLEANUP;
> +       }
> +
> +       return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
> +}
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index e3f1cfb2ff7b..769ef5d76a4b 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -61,6 +61,10 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
>  int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
>                         u64 buf_addr, size_t len, int ddir,
>                         unsigned issue_flags);
> +int io_import_reg_vec(int ddir, struct iov_iter *iter,
> +                       struct io_kiocb *req, struct iou_vec *vec,
> +                       int nr_iovs, unsigned iovec_off,
> +                       unsigned issue_flags);
>
>  int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
>  int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
> @@ -146,6 +150,7 @@ static inline void __io_unaccount_mem(struct user_struct *user,
>  }
>
>  void io_vec_free(struct iou_vec *iv);
> +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries);
>
>  static inline void io_vec_reset_iovec(struct iou_vec *iv,
>                                       struct iovec *iovec, unsigned nr)
> --
> 2.48.1
>
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/8] io_uring: add infra for importing vectored reg buffers
  2025-03-03 20:49   ` Caleb Sander Mateos
@ 2025-03-03 20:57     ` Keith Busch
  2025-03-04 10:05     ` Pavel Begunkov
  1 sibling, 0 replies; 25+ messages in thread
From: Keith Busch @ 2025-03-03 20:57 UTC (permalink / raw)
  To: Caleb Sander Mateos; +Cc: Pavel Begunkov, io-uring, Andres Freund

On Mon, Mar 03, 2025 at 12:49:38PM -0800, Caleb Sander Mateos wrote:
> > +               u64 buf_end;
> > +
> > +               if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end)))
> > +                       return -EFAULT;
> > +               if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
> > +                       return -EFAULT;
> > +
> > +               total_len += iov_len;
> > +               /* by using folio address it also accounts for bvec offset */
> > +               offset = buf_addr - folio_addr;
> > +               src_bvec = imu->bvec + (offset >> imu->folio_shift);
> > +               offset &= folio_mask;
> > +
> > +               for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
> > +                       size_t seg_size = min_t(size_t, iov_len,
> > +                                               folio_size - offset);
> > +
> > +                       res_bvec[bvec_idx].bv_page = src_bvec->bv_page;
> > +                       res_bvec[bvec_idx].bv_offset = offset;
> > +                       res_bvec[bvec_idx].bv_len = seg_size;
> 
> Could just increment res_bvec to avoid the variable bvec_idx?

And utilizing bvec_set_page() to initialize looks a bit cleaner too.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/8] Add support for vectored registered buffers
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
                   ` (7 preceding siblings ...)
  2025-03-03 15:51 ` [PATCH 8/8] io_uring/net: implement vectored reg bufs for zctx Pavel Begunkov
@ 2025-03-03 21:03 ` Andres Freund
  2025-03-04 10:21   ` Pavel Begunkov
  2025-03-04  0:34 ` Caleb Sander Mateos
  9 siblings, 1 reply; 25+ messages in thread
From: Andres Freund @ 2025-03-03 21:03 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: io-uring

Hi,

On 2025-03-03 15:50:55 +0000, Pavel Begunkov wrote:
> Add registered buffer support for vectored io_uring operations. This
> allows passing an iovec, all entries of which must belong to and
> point into the same registered buffer specified by sqe->buf_index.

This is very much appreciated!


> The series covers zerocopy sendmsg and reads / writes. Reads and
> writes are implemented as new opcodes, while zerocopy sendmsg
> reuses IORING_RECVSEND_FIXED_BUF for the API.
> 
> Results are in line with what one would expect from registered buffers:
> 
> t/io_uring + nullblk, single segment 16K:
>   34 -> 46 GiB/s

FWIW, I'd expect bigger wins with real IO when using 1GB huge pages. I
encountered this when there were a lot of reads from a large nvme raid into a
small set of shared huge pages (database buffer pool) by many processes
concurrently. The constant pinning/unpinning of the relevant folios caused a
lot of contention.

Unfortunately switching to registered buffers would, until now, have required
using non-vectored IO, which causes significant performance regressions in
other cases...

Greetings,

Andres Freund

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/8] io_uring/rw: implement vectored registered rw
  2025-03-03 15:50 ` [PATCH 3/8] io_uring/rw: implement vectored registered rw Pavel Begunkov
@ 2025-03-03 23:01   ` Caleb Sander Mateos
  2025-03-03 23:37     ` Caleb Sander Mateos
  2025-03-04 10:09     ` Pavel Begunkov
  0 siblings, 2 replies; 25+ messages in thread
From: Caleb Sander Mateos @ 2025-03-03 23:01 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: io-uring, Andres Freund

On Mon, Mar 3, 2025 at 7:50 AM Pavel Begunkov <[email protected]> wrote:
>
> Implement registered buffer vectored reads and writes with the new
> opcodes IORING_OP_READV_FIXED and IORING_OP_WRITEV_FIXED.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
>  include/uapi/linux/io_uring.h |  2 ++
>  io_uring/opdef.c              | 39 +++++++++++++++++++++++++++
>  io_uring/rw.c                 | 51 +++++++++++++++++++++++++++++++++++
>  io_uring/rw.h                 |  2 ++
>  4 files changed, 94 insertions(+)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 1e02e94bc26d..9dd384b369ee 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -280,6 +280,8 @@ enum io_uring_op {
>         IORING_OP_BIND,
>         IORING_OP_LISTEN,
>         IORING_OP_RECV_ZC,
> +       IORING_OP_READV_FIXED,
> +       IORING_OP_WRITEV_FIXED,
>
>         /* this goes last, obviously */
>         IORING_OP_LAST,
> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index 9511262c513e..6655d2cbf74d 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -529,6 +529,35 @@ const struct io_issue_def io_issue_defs[] = {
>                 .prep                   = io_eopnotsupp_prep,
>  #endif
>         },
> +       [IORING_OP_READV_FIXED] = {
> +               .needs_file             = 1,
> +               .unbound_nonreg_file    = 1,
> +               .pollin                 = 1,
> +               .plug                   = 1,
> +               .audit_skip             = 1,
> +               .ioprio                 = 1,
> +               .iopoll                 = 1,
> +               .iopoll_queue           = 1,
> +               .vectored               = 1,
> +               .async_size             = sizeof(struct io_async_rw),
> +               .prep                   = io_prep_readv_fixed,
> +               .issue                  = io_read,
> +       },
> +       [IORING_OP_WRITEV_FIXED] = {
> +               .needs_file             = 1,
> +               .hash_reg_file          = 1,
> +               .unbound_nonreg_file    = 1,
> +               .pollout                = 1,
> +               .plug                   = 1,
> +               .audit_skip             = 1,
> +               .ioprio                 = 1,
> +               .iopoll                 = 1,
> +               .iopoll_queue           = 1,
> +               .vectored               = 1,
> +               .async_size             = sizeof(struct io_async_rw),
> +               .prep                   = io_prep_writev_fixed,
> +               .issue                  = io_write,
> +       },
>  };
>
>  const struct io_cold_def io_cold_defs[] = {
> @@ -761,6 +790,16 @@ const struct io_cold_def io_cold_defs[] = {
>         [IORING_OP_RECV_ZC] = {
>                 .name                   = "RECV_ZC",
>         },
> +       [IORING_OP_READV_FIXED] = {
> +               .name                   = "READV_FIXED",
> +               .cleanup                = io_readv_writev_cleanup,
> +               .fail                   = io_rw_fail,
> +       },
> +       [IORING_OP_WRITEV_FIXED] = {
> +               .name                   = "WRITEV_FIXED",
> +               .cleanup                = io_readv_writev_cleanup,
> +               .fail                   = io_rw_fail,
> +       },
>  };
>
>  const char *io_uring_get_opcode(u8 opcode)
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index ad7f647d48e9..4c4229f41aaa 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -381,6 +381,57 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>         return __io_prep_rw(req, sqe, ITER_SOURCE);
>  }
>
> +static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
> +{
> +       struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> +       struct io_async_rw *io = req->async_data;
> +       const struct iovec __user *uvec;
> +       size_t uvec_segs = rw->len;
> +       struct iovec *iov;
> +       int iovec_off, ret;
> +       void *res;
> +
> +       if (uvec_segs > io->vec.nr) {
> +               ret = io_vec_realloc(&io->vec, uvec_segs);
> +               if (ret)
> +                       return ret;
> +               req->flags |= REQ_F_NEED_CLEANUP;
> +       }
> +       /* pad iovec to the right */
> +       iovec_off = io->vec.nr - uvec_segs;
> +       iov = io->vec.iovec + iovec_off;
> +       uvec = u64_to_user_ptr(rw->addr);
> +       res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
> +                             io_is_compat(req->ctx));
> +       if (IS_ERR(res))
> +               return PTR_ERR(res);
> +
> +       ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
> +                               uvec_segs, iovec_off, 0);

So the iovecs are being imported at prep time rather than issue time?
I suppose since only user registered buffers are allowed and not
kernel bvecs, you aren't concerned about interactions with the ublk
bvec register/unregister operations? I think in principle the
difference between prep and issue time is still observable if the same
registered buffer index is being used alternately for user and kernel
registered buffers.

Best,
Caleb

> +       iov_iter_save_state(&io->iter, &io->iter_state);
> +       return ret;
> +}
> +
> +int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +       int ret;
> +
> +       ret = __io_prep_rw(req, sqe, ITER_DEST);
> +       if (unlikely(ret))
> +               return ret;
> +       return io_rw_prep_reg_vec(req, ITER_DEST);
> +}
> +
> +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +       int ret;
> +
> +       ret = __io_prep_rw(req, sqe, ITER_SOURCE);
> +       if (unlikely(ret))
> +               return ret;
> +       return io_rw_prep_reg_vec(req, ITER_SOURCE);
> +}
> +
>  /*
>   * Multishot read is prepared just like a normal read/write request, only
>   * difference is that we set the MULTISHOT flag.
> diff --git a/io_uring/rw.h b/io_uring/rw.h
> index e86a3858f48b..475b6306a316 100644
> --- a/io_uring/rw.h
> +++ b/io_uring/rw.h
> @@ -34,6 +34,8 @@ struct io_async_rw {
>
>  int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
>  int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
> +int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
> +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
>  int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
>  int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe);
>  int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe);
> --
> 2.48.1
>
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/8] io_uring/rw: implement vectored registered rw
  2025-03-03 23:01   ` Caleb Sander Mateos
@ 2025-03-03 23:37     ` Caleb Sander Mateos
  2025-03-04 10:09     ` Pavel Begunkov
  1 sibling, 0 replies; 25+ messages in thread
From: Caleb Sander Mateos @ 2025-03-03 23:37 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: io-uring, Andres Freund

On Mon, Mar 3, 2025 at 3:01 PM Caleb Sander Mateos
<[email protected]> wrote:
>
> On Mon, Mar 3, 2025 at 7:50 AM Pavel Begunkov <[email protected]> wrote:
> >
> > Implement registered buffer vectored reads with new opcodes
> > IORING_OP_WRITEV_FIXED and IORING_OP_READV_FIXED.
> >
> > Signed-off-by: Pavel Begunkov <[email protected]>
> > ---
> >  include/uapi/linux/io_uring.h |  2 ++
> >  io_uring/opdef.c              | 39 +++++++++++++++++++++++++++
> >  io_uring/rw.c                 | 51 +++++++++++++++++++++++++++++++++++
> >  io_uring/rw.h                 |  2 ++
> >  4 files changed, 94 insertions(+)
> >
> > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> > index 1e02e94bc26d..9dd384b369ee 100644
> > --- a/include/uapi/linux/io_uring.h
> > +++ b/include/uapi/linux/io_uring.h
> > @@ -280,6 +280,8 @@ enum io_uring_op {
> >         IORING_OP_BIND,
> >         IORING_OP_LISTEN,
> >         IORING_OP_RECV_ZC,
> > +       IORING_OP_READV_FIXED,
> > +       IORING_OP_WRITEV_FIXED,
> >
> >         /* this goes last, obviously */
> >         IORING_OP_LAST,
> > diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> > index 9511262c513e..6655d2cbf74d 100644
> > --- a/io_uring/opdef.c
> > +++ b/io_uring/opdef.c
> > @@ -529,6 +529,35 @@ const struct io_issue_def io_issue_defs[] = {
> >                 .prep                   = io_eopnotsupp_prep,
> >  #endif
> >         },
> > +       [IORING_OP_READV_FIXED] = {
> > +               .needs_file             = 1,
> > +               .unbound_nonreg_file    = 1,
> > +               .pollin                 = 1,
> > +               .plug                   = 1,
> > +               .audit_skip             = 1,
> > +               .ioprio                 = 1,
> > +               .iopoll                 = 1,
> > +               .iopoll_queue           = 1,
> > +               .vectored               = 1,
> > +               .async_size             = sizeof(struct io_async_rw),
> > +               .prep                   = io_prep_readv_fixed,
> > +               .issue                  = io_read,
> > +       },
> > +       [IORING_OP_WRITEV_FIXED] = {
> > +               .needs_file             = 1,
> > +               .hash_reg_file          = 1,
> > +               .unbound_nonreg_file    = 1,
> > +               .pollout                = 1,
> > +               .plug                   = 1,
> > +               .audit_skip             = 1,
> > +               .ioprio                 = 1,
> > +               .iopoll                 = 1,
> > +               .iopoll_queue           = 1,
> > +               .vectored               = 1,
> > +               .async_size             = sizeof(struct io_async_rw),
> > +               .prep                   = io_prep_writev_fixed,
> > +               .issue                  = io_write,
> > +       },
> >  };
> >
> >  const struct io_cold_def io_cold_defs[] = {
> > @@ -761,6 +790,16 @@ const struct io_cold_def io_cold_defs[] = {
> >         [IORING_OP_RECV_ZC] = {
> >                 .name                   = "RECV_ZC",
> >         },
> > +       [IORING_OP_READV_FIXED] = {
> > +               .name                   = "READV_FIXED",
> > +               .cleanup                = io_readv_writev_cleanup,
> > +               .fail                   = io_rw_fail,
> > +       },
> > +       [IORING_OP_WRITEV_FIXED] = {
> > +               .name                   = "WRITEV_FIXED",
> > +               .cleanup                = io_readv_writev_cleanup,
> > +               .fail                   = io_rw_fail,
> > +       },
> >  };
> >
> >  const char *io_uring_get_opcode(u8 opcode)
> > diff --git a/io_uring/rw.c b/io_uring/rw.c
> > index ad7f647d48e9..4c4229f41aaa 100644
> > --- a/io_uring/rw.c
> > +++ b/io_uring/rw.c
> > @@ -381,6 +381,57 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> >         return __io_prep_rw(req, sqe, ITER_SOURCE);
> >  }
> >
> > +static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
> > +{
> > +       struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> > +       struct io_async_rw *io = req->async_data;
> > +       const struct iovec __user *uvec;
> > +       size_t uvec_segs = rw->len;
> > +       struct iovec *iov;
> > +       int iovec_off, ret;
> > +       void *res;
> > +
> > +       if (uvec_segs > io->vec.nr) {
> > +               ret = io_vec_realloc(&io->vec, uvec_segs);
> > +               if (ret)
> > +                       return ret;
> > +               req->flags |= REQ_F_NEED_CLEANUP;
> > +       }
> > +       /* pad iovec to the right */
> > +       iovec_off = io->vec.nr - uvec_segs;
> > +       iov = io->vec.iovec + iovec_off;
> > +       uvec = u64_to_user_ptr(rw->addr);
> > +       res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
> > +                             io_is_compat(req->ctx));
> > +       if (IS_ERR(res))
> > +               return PTR_ERR(res);
> > +
> > +       ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
> > +                               uvec_segs, iovec_off, 0);
>
> So the iovecs are being imported at prep time rather than issue time?
> I suppose since only user registered buffers are allowed and not
> kernel bvecs, you aren't concerned about interactions with the ublk
> bvec register/unregister operations? I think in principle the
> difference between prep and issue time is still observable if the same
> registered buffer index is being used alternately for user and kernel
> registered buffers.

Never mind, I see you change this in the next patch.

Best,
Caleb

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 4/8] io_uring/rw: defer reg buf vec import
  2025-03-03 15:50 ` [PATCH 4/8] io_uring/rw: defer reg buf vec import Pavel Begunkov
@ 2025-03-03 23:37   ` Caleb Sander Mateos
  0 siblings, 0 replies; 25+ messages in thread
From: Caleb Sander Mateos @ 2025-03-03 23:37 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: io-uring, Andres Freund

On Mon, Mar 3, 2025 at 7:52 AM Pavel Begunkov <[email protected]> wrote:
>
> Import registered buffers for vectored reads and writes later at issue
> time as we now do for other fixed ops.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
>  include/linux/io_uring_types.h |  3 +++
>  io_uring/rw.c                  | 36 +++++++++++++++++++++++++++++-----
>  2 files changed, 34 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index b770a2b12da6..d36fccda754b 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -502,6 +502,7 @@ enum {
>         REQ_F_BUFFERS_COMMIT_BIT,
>         REQ_F_BUF_NODE_BIT,
>         REQ_F_HAS_METADATA_BIT,
> +       REQ_F_IMPORT_BUFFER_BIT,
>
>         /* not a real bit, just to check we're not overflowing the space */
>         __REQ_F_LAST_BIT,
> @@ -584,6 +585,8 @@ enum {
>         REQ_F_BUF_NODE          = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
>         /* request has read/write metadata assigned */
>         REQ_F_HAS_METADATA      = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
> +       /* resolve padded iovec to registered buffers */
> +       REQ_F_IMPORT_BUFFER     = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
>  };
>
>  typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 4c4229f41aaa..33a7ab2a8664 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -381,6 +381,24 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>         return __io_prep_rw(req, sqe, ITER_SOURCE);
>  }
>
> +static int io_rw_import_reg_vec(struct io_kiocb *req,
> +                               struct io_async_rw *io,
> +                               int ddir, unsigned int issue_flags)
> +{
> +       struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> +       unsigned uvec_segs = rw->len;
> +       unsigned iovec_off = io->vec.nr - uvec_segs;
> +       int ret;
> +
> +       ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
> +                               uvec_segs, iovec_off, issue_flags);
> +       if (unlikely(ret))
> +               return ret;
> +       iov_iter_save_state(&io->iter, &io->iter_state);
> +       req->flags &= ~REQ_F_IMPORT_BUFFER;
> +       return 0;
> +}
> +
>  static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
>  {
>         struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> @@ -406,10 +424,8 @@ static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
>         if (IS_ERR(res))
>                 return PTR_ERR(res);
>
> -       ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
> -                               uvec_segs, iovec_off, 0);
> -       iov_iter_save_state(&io->iter, &io->iter_state);
> -       return ret;
> +       req->flags |= REQ_F_IMPORT_BUFFER;
> +       return 0;

Looks like ddir is now unused in this function?

Best,
Caleb


>  }
>
>  int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> @@ -906,7 +922,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
>         ssize_t ret;
>         loff_t *ppos;
>
> -       if (io_do_buffer_select(req)) {
> +       if (req->flags & REQ_F_IMPORT_BUFFER) {
> +               ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags);
> +               if (unlikely(ret))
> +                       return ret;
> +       } else if (io_do_buffer_select(req)) {
>                 ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags);
>                 if (unlikely(ret < 0))
>                         return ret;
> @@ -1117,6 +1137,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
>         ssize_t ret, ret2;
>         loff_t *ppos;
>
> +       if (req->flags & REQ_F_IMPORT_BUFFER) {
> +               ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags);
> +               if (unlikely(ret))
> +                       return ret;
> +       }
> +
>         ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
>         if (unlikely(ret))
>                 return ret;
> --
> 2.48.1
>
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 7/8] io_uring/net: convert to struct iou_vec
  2025-03-03 15:51 ` [PATCH 7/8] io_uring/net: convert to struct iou_vec Pavel Begunkov
@ 2025-03-03 23:37   ` Caleb Sander Mateos
  0 siblings, 0 replies; 25+ messages in thread
From: Caleb Sander Mateos @ 2025-03-03 23:37 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: io-uring, Andres Freund

On Mon, Mar 3, 2025 at 7:50 AM Pavel Begunkov <[email protected]> wrote:
>
> Convert net.c to use struct iou_vec.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
>  io_uring/alloc_cache.h |  9 --------
>  io_uring/net.c         | 51 ++++++++++++++++++------------------------
>  io_uring/net.h         |  6 ++---
>  3 files changed, 25 insertions(+), 41 deletions(-)
>
> diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
> index 0dd17d8ba93a..7094d9d0bd29 100644
> --- a/io_uring/alloc_cache.h
> +++ b/io_uring/alloc_cache.h
> @@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache,
>
>  void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp);
>
> -static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr)
> -{
> -       if (IS_ENABLED(CONFIG_KASAN)) {
> -               kfree(*iov);
> -               *iov = NULL;
> -               *nr = 0;
> -       }
> -}
> -
>  static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
>                                       void *entry)
>  {
> diff --git a/io_uring/net.c b/io_uring/net.c
> index cbb889b85cfc..a4b39343f345 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -136,11 +136,8 @@ static bool io_net_retry(struct socket *sock, int flags)
>
>  static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
>  {
> -       if (kmsg->free_iov) {
> -               kfree(kmsg->free_iov);
> -               kmsg->free_iov_nr = 0;
> -               kmsg->free_iov = NULL;
> -       }
> +       if (kmsg->vec.iovec)
> +               io_vec_free(&kmsg->vec);

io_vec_free() already checks vec.iovec, is it necessary to duplicate the
check? If this is an unlikely case and you'd like to avoid the function call
overhead, how about moving io_vec_free() to the header file so it can be
inlined?
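
Something like this minimal sketch, assuming io_vec_free() keeps the shape
it has earlier in the series (kfree(NULL) is a no-op, so the NULL check can
be dropped entirely once it's inline):

        /* io_uring/rsrc.h */
        static inline void io_vec_free(struct iou_vec *iv)
        {
                kfree(iv->iovec);
                iv->iovec = NULL;
                iv->nr = 0;
        }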

Best,
Caleb


>  }
>
>  static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
> @@ -154,7 +151,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
>         }
>
>         /* Let normal cleanup path reap it if we fail adding to the cache */
> -       io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr);
> +       io_alloc_cache_vec_kasan(&hdr->vec);
>         if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
>                 req->async_data = NULL;
>                 req->flags &= ~REQ_F_ASYNC_DATA;
> @@ -171,7 +168,7 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
>                 return NULL;
>
>         /* If the async data was cached, we might have an iov cached inside. */
> -       if (hdr->free_iov)
> +       if (hdr->vec.iovec)
>                 req->flags |= REQ_F_NEED_CLEANUP;
>         return hdr;
>  }
> @@ -182,10 +179,7 @@ static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg
>  {
>         if (iov) {
>                 req->flags |= REQ_F_NEED_CLEANUP;
> -               kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
> -               if (kmsg->free_iov)
> -                       kfree(kmsg->free_iov);
> -               kmsg->free_iov = iov;
> +               io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs);
>         }
>  }
>
> @@ -208,9 +202,9 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg
>         struct iovec *iov;
>         int ret, nr_segs;
>
> -       if (iomsg->free_iov) {
> -               nr_segs = iomsg->free_iov_nr;
> -               iov = iomsg->free_iov;
> +       if (iomsg->vec.iovec) {
> +               nr_segs = iomsg->vec.nr;
> +               iov = iomsg->vec.iovec;
>         } else {
>                 nr_segs = 1;
>                 iov = &iomsg->fast_iov;
> @@ -468,7 +462,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
>         if (iter_is_ubuf(&kmsg->msg.msg_iter))
>                 return 1;
>
> -       iov = kmsg->free_iov;
> +       iov = kmsg->vec.iovec;
>         if (!iov)
>                 iov = &kmsg->fast_iov;
>
> @@ -584,9 +578,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
>                 .nr_iovs = 1,
>         };
>
> -       if (kmsg->free_iov) {
> -               arg.nr_iovs = kmsg->free_iov_nr;
> -               arg.iovs = kmsg->free_iov;
> +       if (kmsg->vec.iovec) {
> +               arg.nr_iovs = kmsg->vec.nr;
> +               arg.iovs = kmsg->vec.iovec;
>                 arg.mode = KBUF_MODE_FREE;
>         }
>
> @@ -599,9 +593,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
>         if (unlikely(ret < 0))
>                 return ret;
>
> -       if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
> -               kmsg->free_iov_nr = ret;
> -               kmsg->free_iov = arg.iovs;
> +       if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
> +               kmsg->vec.nr = ret;
> +               kmsg->vec.iovec = arg.iovs;
>                 req->flags |= REQ_F_NEED_CLEANUP;
>         }
>         sr->len = arg.out_len;
> @@ -1085,9 +1079,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
>                         .mode = KBUF_MODE_EXPAND,
>                 };
>
> -               if (kmsg->free_iov) {
> -                       arg.nr_iovs = kmsg->free_iov_nr;
> -                       arg.iovs = kmsg->free_iov;
> +               if (kmsg->vec.iovec) {
> +                       arg.nr_iovs = kmsg->vec.nr;
> +                       arg.iovs = kmsg->vec.iovec;
>                         arg.mode |= KBUF_MODE_FREE;
>                 }
>
> @@ -1106,9 +1100,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
>                 }
>                 iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
>                                 arg.out_len);
> -               if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
> -                       kmsg->free_iov_nr = ret;
> -                       kmsg->free_iov = arg.iovs;
> +               if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
> +                       kmsg->vec.nr = ret;
> +                       kmsg->vec.iovec = arg.iovs;
>                         req->flags |= REQ_F_NEED_CLEANUP;
>                 }
>         } else {
> @@ -1874,8 +1868,7 @@ void io_netmsg_cache_free(const void *entry)
>  {
>         struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
>
> -       if (kmsg->free_iov)
> -               io_netmsg_iovec_free(kmsg);
> +       io_vec_free(&kmsg->vec);
>         kfree(kmsg);
>  }
>  #endif
> diff --git a/io_uring/net.h b/io_uring/net.h
> index b804c2b36e60..43e5ce5416b7 100644
> --- a/io_uring/net.h
> +++ b/io_uring/net.h
> @@ -2,12 +2,12 @@
>
>  #include <linux/net.h>
>  #include <linux/uio.h>
> +#include <linux/io_uring_types.h>
>
>  struct io_async_msghdr {
>  #if defined(CONFIG_NET)
> -       struct iovec                    *free_iov;
> -       /* points to an allocated iov, if NULL we use fast_iov instead */
> -       int                             free_iov_nr;
> +       struct iou_vec                          vec;
> +
>         struct_group(clear,
>                 int                             namelen;
>                 struct iovec                    fast_iov;
> --
> 2.48.1
>
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/8] Add support for vectored registered buffers
  2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
                   ` (8 preceding siblings ...)
  2025-03-03 21:03 ` [PATCH 0/8] Add support for vectored registered buffers Andres Freund
@ 2025-03-04  0:34 ` Caleb Sander Mateos
  2025-03-04 10:26   ` Pavel Begunkov
  2025-03-04 12:12   ` Stefan Metzmacher
  9 siblings, 2 replies; 25+ messages in thread
From: Caleb Sander Mateos @ 2025-03-04  0:34 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: io-uring, Andres Freund

On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <[email protected]> wrote:
>
> Add registered buffer support for vectored io_uring operations. That
> allows to pass an iovec, all entries of which must belong to and
> point into the same registered buffer specified by sqe->buf_index.
>
> The series covers zerocopy sendmsg and reads / writes. Reads and
> writes are implemented as new opcodes, while zerocopy sendmsg
> reuses IORING_RECVSEND_FIXED_BUF for the api.
>
> Results are aligned to what one would expect from registered buffers:
>
> t/io_uring + nullblk, single segment 16K:
>   34 -> 46 GiB/s
> examples/send-zerocopy.c default send size (64KB):
>   82558 -> 123855 MB/s

Thanks for implementing this, it's great to be able to combine these 2
optimizations! Though I suspect many applications will want to perform
vectorized I/O using iovecs that come from different registered
buffers (e.g. separate header and data allocations). Perhaps a future
improvement could allow a list of buffer indices to be specified.

Thanks,
Caleb

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/8] io_uring: add infra for importing vectored reg buffers
  2025-03-03 20:49   ` Caleb Sander Mateos
  2025-03-03 20:57     ` Keith Busch
@ 2025-03-04 10:05     ` Pavel Begunkov
  2025-03-04 15:18       ` Pavel Begunkov
  1 sibling, 1 reply; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-04 10:05 UTC (permalink / raw)
  To: Caleb Sander Mateos; +Cc: io-uring, Andres Freund

On 3/3/25 20:49, Caleb Sander Mateos wrote:
> On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <[email protected]> wrote:
...
> 
> If I understand correctly, io_import_reg_vec() converts the iovecs to
> bio_vecs in place. If an iovec expands to more than one bio_vec (i.e.
> crosses a folio boundary), wouldn't the bio_vecs overwrite iovecs that
> hadn't been processed yet?

It's handled, obviously; you missed that the vectors are
offset from each other.
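
Roughly, the layout is (an illustration, indices not to scale):

        io->vec:  [ bvec output grows from 0 ... | iov[0] ... iov[n-1] ]
                                                   ^ iovec_off = nr - n

The iovecs are padded to the right end of the array, and the array is
sized for the worst-case number of bvec segments, so the bvecs being
written from index 0 never catch up with the iovec currently being read.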

> >> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
>> index 9b05e614819e..1ec1f5b3e385 100644
>> --- a/io_uring/rsrc.c
>> +++ b/io_uring/rsrc.c
...
>> +               for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
>> +                       size_t seg_size = min_t(size_t, iov_len,
>> +                                               folio_size - offset);
>> +
>> +                       res_bvec[bvec_idx].bv_page = src_bvec->bv_page;
>> +                       res_bvec[bvec_idx].bv_offset = offset;
>> +                       res_bvec[bvec_idx].bv_len = seg_size;
> 
> Could just increment res_bvec to avoid the variable bvec_idx?

I don't see the benefit.

>> +       for (i = 0; i < nr_iovs; i++)
>> +               max_segs += (iov[i].iov_len >> shift) + 2;
> 
> Seems like this may overestimate a bit. I think something like this
> would give the exact number of segments for each iovec?
> (((u64)iov_base & folio_mask) + iov_len + folio_mask) >> folio_shift

It's overestimated exactly to avoid a beast like this.
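
To put a number on it (assuming 4K folios): a 16K iovec reserves
(16384 >> 12) + 2 = 6 bvec slots, while an unaligned 16K range can span
at most 5 folios. That's one spare slot per iovec in exchange for a much
simpler expression.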

-- 
Pavel Begunkov


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/8] io_uring/rw: implement vectored registered rw
  2025-03-03 23:01   ` Caleb Sander Mateos
  2025-03-03 23:37     ` Caleb Sander Mateos
@ 2025-03-04 10:09     ` Pavel Begunkov
  1 sibling, 0 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-04 10:09 UTC (permalink / raw)
  To: Caleb Sander Mateos; +Cc: io-uring, Andres Freund

On 3/3/25 23:01, Caleb Sander Mateos wrote:
...
>> +       /* pad iovec to the right */
>> +       iovec_off = io->vec.nr - uvec_segs;
>> +       iov = io->vec.iovec + iovec_off;
>> +       uvec = u64_to_user_ptr(rw->addr);
>> +       res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
>> +                             io_is_compat(req->ctx));
>> +       if (IS_ERR(res))
>> +               return PTR_ERR(res);
>> +
>> +       ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
>> +                               uvec_segs, iovec_off, 0);
> 
> So the iovecs are being imported at prep time rather than issue time?
> I suppose since only user registered buffers are allowed and not
> kernel bvecs, you aren't concerned about interactions with the ublk

It's a question of generic io_uring policy and has nothing
to do with ublk. Thinking about it in terms of a specific
user wouldn't be productive.

-- 
Pavel Begunkov


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/8] Add support for vectored registered buffers
  2025-03-03 21:03 ` [PATCH 0/8] Add support for vectored registered buffers Andres Freund
@ 2025-03-04 10:21   ` Pavel Begunkov
  0 siblings, 0 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-04 10:21 UTC (permalink / raw)
  To: Andres Freund; +Cc: io-uring

On 3/3/25 21:03, Andres Freund wrote:
> Hi,
> 
> On 2025-03-03 15:50:55 +0000, Pavel Begunkov wrote:
>> Add registered buffer support for vectored io_uring operations. That
>> allows to pass an iovec, all entries of which must belong to and
>> point into the same registered buffer specified by sqe->buf_index.
> 
> This is very much appreciated!

Glad to hear. I do remember you mentioning the contention issue
on the list, and a bunch of other people were interested as well.

>> The series covers zerocopy sendmsg and reads / writes. Reads and
>> writes are implemented as new opcodes, while zerocopy sendmsg
>> reuses IORING_RECVSEND_FIXED_BUF for the api.
>>
>> Results are aligned to what one would expect from registered buffers:
>>
>> t/io_uring + nullblk, single segment 16K:
>>    34 -> 46 GiB/s
> 
> FWIW, I'd expect bigger wins with real IO when using 1GB huge pages. I

I didn't even benchmark it meaningfully, as we should be able to
extrapolate results from the registered buffer tests, but I agree, such
contention might make it even more desirable.


> encountered this when there were a lot of reads from a large nvme raid into a
> small set of shared huge pages (a database buffer pool) by many processes
> concurrently. The constant pinning/unpinning of the relevant folios caused a
> lot of contention.
> 
> Unfortunately switching to registered buffers would, until now, have required
> using non-vectored IO, which causes significant performance regressions in
> other cases...

-- 
Pavel Begunkov


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/8] Add support for vectored registered buffers
  2025-03-04  0:34 ` Caleb Sander Mateos
@ 2025-03-04 10:26   ` Pavel Begunkov
  2025-03-04 12:12   ` Stefan Metzmacher
  1 sibling, 0 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-04 10:26 UTC (permalink / raw)
  To: Caleb Sander Mateos; +Cc: io-uring, Andres Freund

On 3/4/25 00:34, Caleb Sander Mateos wrote:
> On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <[email protected]> wrote:
>>
>> Add registered buffer support for vectored io_uring operations. That
>> allows to pass an iovec, all entries of which must belong to and
>> point into the same registered buffer specified by sqe->buf_index.
>>
>> The series covers zerocopy sendmsg and reads / writes. Reads and
>> writes are implemented as new opcodes, while zerocopy sendmsg
>> reuses IORING_RECVSEND_FIXED_BUF for the api.
>>
>> Results are aligned to what one would expect from registered buffers:
>>
>> t/io_uring + nullblk, single segment 16K:
>>    34 -> 46 GiB/s
>> examples/send-zerocopy.c default send size (64KB):
>>    82558 -> 123855 MB/s
> 
> Thanks for implementing this, it's great to be able to combine these 2
> optimizations! Though I suspect many applications will want to perform
> vectorized I/O using iovecs that come from different registered
> buffers (e.g. separate header and data allocations). Perhaps a future
> improvement could allow a list of buffer indices to be specified.

That's the design decision made; otherwise both the API and its handling
become a mess. The user has to be smart and keep a small number of large
registered buffers, potentially growing them, which is a good thing
regardless of this feature.

-- 
Pavel Begunkov


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/8] Add support for vectored registered buffers
  2025-03-04  0:34 ` Caleb Sander Mateos
  2025-03-04 10:26   ` Pavel Begunkov
@ 2025-03-04 12:12   ` Stefan Metzmacher
  2025-03-04 14:20     ` Pavel Begunkov
  1 sibling, 1 reply; 25+ messages in thread
From: Stefan Metzmacher @ 2025-03-04 12:12 UTC (permalink / raw)
  To: Caleb Sander Mateos, Pavel Begunkov; +Cc: io-uring, Andres Freund

Hi Pavel,

>> Add registered buffer support for vectored io_uring operations. That
>> allows to pass an iovec, all entries of which must belong to and
>> point into the same registered buffer specified by sqe->buf_index.
>>
>> The series covers zerocopy sendmsg and reads / writes. Reads and
>> writes are implemented as new opcodes, while zerocopy sendmsg
>> reuses IORING_RECVSEND_FIXED_BUF for the api.
>>
>> Results are aligned to what one would expect from registered buffers:
>>
>> t/io_uring + nullblk, single segment 16K:
>>    34 -> 46 GiB/s
>> examples/send-zerocopy.c default send size (64KB):
>>    82558 -> 123855 MB/s
> 
> Thanks for implementing this, it's great to be able to combine these 2
> optimizations! Though I suspect many applications will want to perform
> vectorized I/O using iovecs that come from different registered
> buffers (e.g. separate header and data allocations). Perhaps a future
> improvement could allow a list of buffer indices to be specified.

I'm wondering about the same thing. And it's not completely
clear to me what the value of iov_base is in this case:
is it the offset into the buffer, or the real pointer address
that must be within the range of the registered buffer?

It might also be very useful to have some vector elements pointing
into one of the registered buffers, while others refer to non-registered
buffers.

metze




^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/8] Add support for vectored registered buffers
  2025-03-04 12:12   ` Stefan Metzmacher
@ 2025-03-04 14:20     ` Pavel Begunkov
  0 siblings, 0 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-04 14:20 UTC (permalink / raw)
  To: Stefan Metzmacher, Caleb Sander Mateos; +Cc: io-uring, Andres Freund

On 3/4/25 12:12, Stefan Metzmacher wrote:
> Hi Pavel,
> 
>>> Add registered buffer support for vectored io_uring operations. That
>>> allows to pass an iovec, all entries of which must belong to and
>>> point into the same registered buffer specified by sqe->buf_index.
>>>
>>> The series covers zerocopy sendmsg and reads / writes. Reads and
>>> writes are implemented as new opcodes, while zerocopy sendmsg
>>> reuses IORING_RECVSEND_FIXED_BUF for the api.
>>>
>>> Results are aligned to what one would expect from registered buffers:
>>>
>>> t/io_uring + nullblk, single segment 16K:
>>>    34 -> 46 GiB/s
>>> examples/send-zerocopy.c default send size (64KB):
>>>    82558 -> 123855 MB/s
>>
>> Thanks for implementing this, it's great to be able to combine these 2
>> optimizations! Though I suspect many applications will want to perform
>> vectorized I/O using iovecs that come from different registered
>> buffers (e.g. separate header and data allocations). Perhaps a future
>> improvement could allow a list of buffer indices to be specified.
> 
> I'm wondering about the same thing. And it's not completely
> clear to me what the value of iov_base is in this case:
> is it the offset into the buffer, or the real pointer address
> that must be within the range of the registered buffer?

Same as with other registered buffer requests. It's a pointer into
the initial buffer you specified when registering it, and it's used
to calculate the offset.

See the io_uring_register(2) man page: addr and len are iov_base and
iov_len, and there are multiple of them. You can call it confusing,
and I'd agree, but that's how it was done from the very beginning,
so we're staying consistent here.

https://github.com/axboe/liburing/blob/master/man/io_uring_register.2#L87
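
To make it concrete, a sketch with raw sqe fields (names and sizes are
illustrative only):

        /* char *buf: the iov_base passed when registering buffer 0 */
        struct iovec vecs[2] = {
                /* every entry must point within the registered range */
                { .iov_base = buf,         .iov_len = 4096 },
                { .iov_base = buf + 65536, .iov_len = 8192 },
        };

        sqe->opcode    = IORING_OP_READV_FIXED;
        sqe->addr      = (unsigned long) vecs;
        sqe->len       = 2;    /* number of iovec entries */
        sqe->buf_index = 0;    /* registered buffer to resolve against */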

  
> It might also be very useful to have some vector elements pointing
> into one of the registered buffers, while others refer to non-registered
> buffers.

See the other reply.

-- 
Pavel Begunkov


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/8] io_uring: add infra for importing vectored reg buffers
  2025-03-04 10:05     ` Pavel Begunkov
@ 2025-03-04 15:18       ` Pavel Begunkov
  0 siblings, 0 replies; 25+ messages in thread
From: Pavel Begunkov @ 2025-03-04 15:18 UTC (permalink / raw)
  To: Caleb Sander Mateos; +Cc: io-uring, Andres Freund

On 3/4/25 10:05, Pavel Begunkov wrote:
> On 3/3/25 20:49, Caleb Sander Mateos wrote:
>> On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <[email protected]> wrote:
> ...
>>> +       for (i = 0; i < nr_iovs; i++)
>>> +               max_segs += (iov[i].iov_len >> shift) + 2;
>>
>> Seems like this may overestimate a bit. I think something like this
>> would give the exact number of segments for each iovec?
>> (((u64)iov_base & folio_mask) + iov_len + folio_mask) >> folio_shift
> 
> It's overestimated exactly to avoid a beast like this.

And it'd be broken as well for 0 len.

-- 
Pavel Begunkov


^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread

Thread overview: 25+ messages
2025-03-03 15:50 [PATCH 0/8] Add support for vectored registered buffers Pavel Begunkov
2025-03-03 15:50 ` [PATCH 1/8] io_uring: introduce struct iou_vec Pavel Begunkov
2025-03-03 18:23   ` Caleb Sander Mateos
2025-03-03 15:50 ` [PATCH 2/8] io_uring: add infra for importing vectored reg buffers Pavel Begunkov
2025-03-03 20:49   ` Caleb Sander Mateos
2025-03-03 20:57     ` Keith Busch
2025-03-04 10:05     ` Pavel Begunkov
2025-03-04 15:18       ` Pavel Begunkov
2025-03-03 15:50 ` [PATCH 3/8] io_uring/rw: implement vectored registered rw Pavel Begunkov
2025-03-03 23:01   ` Caleb Sander Mateos
2025-03-03 23:37     ` Caleb Sander Mateos
2025-03-04 10:09     ` Pavel Begunkov
2025-03-03 15:50 ` [PATCH 4/8] io_uring/rw: defer reg buf vec import Pavel Begunkov
2025-03-03 23:37   ` Caleb Sander Mateos
2025-03-03 15:51 ` [PATCH 5/8] io_uring/net: combine msghdr copy Pavel Begunkov
2025-03-03 15:51 ` [PATCH 6/8] io_uring/net: pull vec alloc out of msghdr import Pavel Begunkov
2025-03-03 15:51 ` [PATCH 7/8] io_uring/net: convert to struct iou_vec Pavel Begunkov
2025-03-03 23:37   ` Caleb Sander Mateos
2025-03-03 15:51 ` [PATCH 8/8] io_uring/net: implement vectored reg bufs for zctx Pavel Begunkov
2025-03-03 21:03 ` [PATCH 0/8] Add support for vectored registered buffers Andres Freund
2025-03-04 10:21   ` Pavel Begunkov
2025-03-04  0:34 ` Caleb Sander Mateos
2025-03-04 10:26   ` Pavel Begunkov
2025-03-04 12:12   ` Stefan Metzmacher
2025-03-04 14:20     ` Pavel Begunkov
