* [PATCH v2 0/9] Add support for vectored registered buffers
@ 2025-03-04 15:40 Pavel Begunkov
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
Add registered buffer support for vectored io_uring operations. This
allows passing an iovec, all entries of which must belong to and
point into the same registered buffer, specified by sqe->buf_index.
The series covers zerocopy sendmsg and reads / writes. Reads and
writes are implemented as new opcodes, while zerocopy sendmsg
reuses IORING_RECVSEND_FIXED_BUF for the API.
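For illustration, a minimal hypothetical userspace sketch of the new
read opcode, using liburing's generic io_uring_prep_rw() helper rather
than assuming a dedicated one exists; buf is a char * registered
earlier with io_uring_register_buffers(), ring/fd set up elsewhere:

        struct iovec iov[2] = {
                { .iov_base = buf,        .iov_len = 4096 },
                { .iov_base = buf + 8192, .iov_len = 4096 },
        };
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

        /* addr/len carry the iovec array and its entry count, as for READV */
        io_uring_prep_rw(IORING_OP_READV_FIXED, sqe, fd, iov, 2, 0);
        /* the registered buffer every iovec entry must point into */
        sqe->buf_index = 0;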
The results are in line with what one would expect from registered buffers:
t/io_uring + nullblk, single segment 16K:
34 -> 46 GiB/s
examples/send-zerocopy.c default send size (64KB):
82558 -> 123855 MB/s
The series is placed on top of 6.15 + zcrx.
liburing + tests:
https://github.com/isilence/liburing.git regbuf-import
v2:
- Don't warn on vec allocation failures (__GFP_NOWARN)
- Cap the size of cached iovec/bvec arrays
- Check for length overflow
- Reject zero-length segments
- Other minor changes
Pavel Begunkov (9):
io_uring: introduce struct iou_vec
io_uring: add infra for importing vectored reg buffers
io_uring/rw: implement vectored registered rw
io_uring/rw: defer reg buf vec import
io_uring/net: combine msghdr copy
io_uring/net: pull vec alloc out of msghdr import
io_uring/net: convert to struct iou_vec
io_uring/net: implement vectored reg bufs for zctx
io_uring: cap cached iovec/bvec size
include/linux/io_uring_types.h | 11 ++
include/uapi/linux/io_uring.h | 2 +
io_uring/alloc_cache.h | 9 --
io_uring/net.c | 180 +++++++++++++++++++++------------
io_uring/net.h | 6 +-
io_uring/opdef.c | 39 +++++++
io_uring/rsrc.c | 131 ++++++++++++++++++++++++
io_uring/rsrc.h | 24 +++++
io_uring/rw.c | 99 ++++++++++++++++--
io_uring/rw.h | 6 +-
10 files changed, 415 insertions(+), 92 deletions(-)
--
2.48.1
* [PATCH v2 1/9] io_uring: introduce struct iou_vec
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
I need a convenient way to pass around and work with an iovec+size
pair; put them into a structure and make use of it in rw.c.
Signed-off-by: Pavel Begunkov <[email protected]>
---
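A minimal sketch of the ownership rules these helpers establish
(hypothetical caller, not from this patch): an iou_vec owns a single
kmalloc'ed array, io_vec_reset_iovec() frees any previous array before
taking over a new one, and io_vec_free() is the one cleanup point:

        struct iou_vec iv = {};
        struct iovec *iov;

        iov = kmalloc_array(8, sizeof(*iov), GFP_KERNEL);
        if (!iov)
                return -ENOMEM;
        io_vec_reset_iovec(&iv, iov, 8);        /* iv now owns iov */

        /* ... use iv.iovec / iv.nr ... */

        io_vec_free(&iv);                       /* iv.iovec = NULL, iv.nr = 0 */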
include/linux/io_uring_types.h | 5 +++++
io_uring/rsrc.c | 9 +++++++++
io_uring/rsrc.h | 17 +++++++++++++++++
io_uring/rw.c | 17 +++++++----------
io_uring/rw.h | 4 ++--
5 files changed, 40 insertions(+), 12 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 35fc241c4672..9101f12d21ef 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -110,6 +110,11 @@ struct io_uring_task {
} ____cacheline_aligned_in_smp;
};
+struct iou_vec {
+ struct iovec *iovec;
+ unsigned nr;
+};
+
struct io_uring {
u32 head;
u32 tail;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d6ac41840900..9b05e614819e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1264,3 +1264,12 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
fput(file);
return ret;
}
+
+void io_vec_free(struct iou_vec *iv)
+{
+ if (!iv->iovec)
+ return;
+ kfree(iv->iovec);
+ iv->iovec = NULL;
+ iv->nr = 0;
+}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 284e300e63fb..ff78ead6bc75 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -4,6 +4,7 @@
#include <linux/io_uring_types.h>
#include <linux/lockdep.h>
+#include <linux/io_uring_types.h>
enum {
IORING_RSRC_FILE = 0,
@@ -145,4 +146,20 @@ static inline void __io_unaccount_mem(struct user_struct *user,
atomic_long_sub(nr_pages, &user->locked_vm);
}
+void io_vec_free(struct iou_vec *iv);
+
+static inline void io_vec_reset_iovec(struct iou_vec *iv,
+ struct iovec *iovec, unsigned nr)
+{
+ io_vec_free(iv);
+ iv->iovec = iovec;
+ iv->nr = nr;
+}
+
+static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv)
+{
+ if (IS_ENABLED(CONFIG_KASAN))
+ io_vec_free(iv);
+}
+
#endif
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 5ee9f8949e8b..ad7f647d48e9 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -87,9 +87,9 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
int ret, nr_segs;
struct iovec *iov;
- if (io->free_iovec) {
- nr_segs = io->free_iov_nr;
- iov = io->free_iovec;
+ if (io->vec.iovec) {
+ nr_segs = io->vec.nr;
+ iov = io->vec.iovec;
} else {
nr_segs = 1;
iov = &io->fast_iov;
@@ -101,9 +101,7 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
return ret;
if (iov) {
req->flags |= REQ_F_NEED_CLEANUP;
- io->free_iov_nr = io->iter.nr_segs;
- kfree(io->free_iovec);
- io->free_iovec = iov;
+ io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs);
}
return 0;
}
@@ -151,7 +149,7 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
return;
- io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr);
+ io_alloc_cache_vec_kasan(&rw->vec);
if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
@@ -201,7 +199,7 @@ static int io_rw_alloc_async(struct io_kiocb *req)
rw = io_uring_alloc_async_data(&ctx->rw_cache, req);
if (!rw)
return -ENOMEM;
- if (rw->free_iovec)
+ if (rw->vec.iovec)
req->flags |= REQ_F_NEED_CLEANUP;
rw->bytes_done = 0;
return 0;
@@ -1327,7 +1325,6 @@ void io_rw_cache_free(const void *entry)
{
struct io_async_rw *rw = (struct io_async_rw *) entry;
- if (rw->free_iovec)
- kfree(rw->free_iovec);
+ io_vec_free(&rw->vec);
kfree(rw);
}
diff --git a/io_uring/rw.h b/io_uring/rw.h
index bf121b81ebe8..529fd2f96a7f 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -9,13 +9,13 @@ struct io_meta_state {
};
struct io_async_rw {
+ struct iou_vec vec;
size_t bytes_done;
- struct iovec *free_iovec;
+
struct_group(clear,
struct iov_iter iter;
struct iov_iter_state iter_state;
struct iovec fast_iov;
- int free_iov_nr;
/*
* wpq is for buffered io, while meta fields are used with
* direct io
--
2.48.1
* [PATCH v2 2/9] io_uring: add infra for importing vectored reg buffers
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
Add io_import_reg_vec(), which will be responsible for importing
vectored registered buffers. The iovec array is overlapped with the
resulting bvec array in memory, which is why the iovec is expected to
be padded out in iou_vec.
Signed-off-by: Pavel Begunkov <[email protected]>
---
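To make the overlap concrete, a sketch of the intended memory layout
(assuming a 64-bit build, where the BUILD_BUG_ON below guarantees a
bio_vec fits into an iovec slot):

        vec->iovec: [ bvec0 | bvec1 | ... | iov0 | iov1 | ... ]
                      ^ bvecs written       ^ iovecs stored at the tail
                        front to back         (iovec_off slots in) and
                                              read front to back

The padding keeps every not-yet-read iovec ahead of the bvec slot being
written. io_estimate_bvec_size() over-estimates each entry as
(iov_len >> folio_shift) + 2, the +2 covering a partial head folio and
a partial tail folio.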
include/linux/io_uring_types.h | 5 +-
io_uring/rsrc.c | 122 +++++++++++++++++++++++++++++++++
io_uring/rsrc.h | 5 ++
3 files changed, 131 insertions(+), 1 deletion(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 9101f12d21ef..b770a2b12da6 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -111,7 +111,10 @@ struct io_uring_task {
};
struct iou_vec {
- struct iovec *iovec;
+ union {
+ struct iovec *iovec;
+ struct bio_vec *bvec;
+ };
unsigned nr;
};
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 9b05e614819e..38743886bbf4 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1267,9 +1267,131 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
void io_vec_free(struct iou_vec *iv)
{
+ BUILD_BUG_ON(sizeof(struct bio_vec) > sizeof(struct iovec));
+
if (!iv->iovec)
return;
kfree(iv->iovec);
iv->iovec = NULL;
iv->nr = 0;
}
+
+int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
+{
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ struct iovec *iov;
+
+ iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
+ if (!iov)
+ return -ENOMEM;
+
+ io_vec_free(iv);
+ iv->iovec = iov;
+ iv->nr = nr_entries;
+ return 0;
+}
+
+static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
+ struct io_mapped_ubuf *imu,
+ struct iovec *iovec, unsigned nr_iovs,
+ struct iou_vec *vec)
+{
+ unsigned long folio_size = 1 << imu->folio_shift;
+ unsigned long folio_mask = folio_size - 1;
+ u64 folio_addr = imu->ubuf & ~folio_mask;
+ struct bio_vec *res_bvec = vec->bvec;
+ size_t total_len = 0;
+ unsigned bvec_idx = 0;
+ unsigned iov_idx;
+
+ for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
+ size_t iov_len = iovec[iov_idx].iov_len;
+ u64 buf_addr = (u64)iovec[iov_idx].iov_base;
+ struct bio_vec *src_bvec;
+ size_t offset;
+ u64 buf_end;
+
+ if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end)))
+ return -EFAULT;
+ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
+ return -EFAULT;
+ if (unlikely(!iov_len))
+ return -EFAULT;
+ if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
+ return -EOVERFLOW;
+
+ /* by using folio address it also accounts for bvec offset */
+ offset = buf_addr - folio_addr;
+ src_bvec = imu->bvec + (offset >> imu->folio_shift);
+ offset &= folio_mask;
+
+ for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
+ size_t seg_size = min_t(size_t, iov_len,
+ folio_size - offset);
+
+ bvec_set_page(&res_bvec[bvec_idx],
+ src_bvec->bv_page, seg_size, offset);
+ iov_len -= seg_size;
+ }
+ }
+ if (total_len > MAX_RW_COUNT)
+ return -EINVAL;
+
+ iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
+ return 0;
+}
+
+static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
+ struct io_mapped_ubuf *imu)
+{
+ unsigned shift = imu->folio_shift;
+ size_t max_segs = 0;
+ unsigned i;
+
+ for (i = 0; i < nr_iovs; i++)
+ max_segs += (iov[i].iov_len >> shift) + 2;
+ return max_segs;
+}
+
+int io_import_reg_vec(int ddir, struct iov_iter *iter,
+ struct io_kiocb *req, struct iou_vec *vec,
+ unsigned nr_iovs, unsigned iovec_off,
+ unsigned issue_flags)
+{
+ struct io_rsrc_node *node;
+ struct io_mapped_ubuf *imu;
+ struct iovec *iov;
+ unsigned nr_segs;
+
+ node = io_find_buf_node(req, issue_flags);
+ if (!node)
+ return -EFAULT;
+ imu = node->buf;
+ if (imu->is_kbuf)
+ return -EOPNOTSUPP;
+ if (!(imu->dir & (1 << ddir)))
+ return -EFAULT;
+
+ iov = vec->iovec + iovec_off;
+ nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
+
+ if (WARN_ON_ONCE(iovec_off + nr_iovs != vec->nr) ||
+ nr_segs > vec->nr) {
+ struct iou_vec tmp_vec = {};
+ int ret;
+
+ ret = io_vec_realloc(&tmp_vec, nr_segs);
+ if (ret)
+ return ret;
+
+ iovec_off = tmp_vec.nr - nr_iovs;
+ memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
+ io_vec_free(vec);
+
+ *vec = tmp_vec;
+ iov = vec->iovec + iovec_off;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+
+ return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
+}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index ff78ead6bc75..f1496f7d844f 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -62,6 +62,10 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
u64 buf_addr, size_t len, int ddir,
unsigned issue_flags);
+int io_import_reg_vec(int ddir, struct iov_iter *iter,
+ struct io_kiocb *req, struct iou_vec *vec,
+ unsigned nr_iovs, unsigned iovec_off,
+ unsigned issue_flags);
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
@@ -147,6 +151,7 @@ static inline void __io_unaccount_mem(struct user_struct *user,
}
void io_vec_free(struct iou_vec *iv);
+int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries);
static inline void io_vec_reset_iovec(struct iou_vec *iv,
struct iovec *iovec, unsigned nr)
--
2.48.1
* [PATCH v2 3/9] io_uring/rw: implement vectored registered rw
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
Implement registered buffer vectored reads and writes with the new
opcodes IORING_OP_READV_FIXED and IORING_OP_WRITEV_FIXED.
Signed-off-by: Pavel Begunkov <[email protected]>
---
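The SQE layout mirrors plain READV/WRITEV; a hypothetical write-side
sketch of how the fields are interpreted (again via liburing's generic
io_uring_prep_rw() helper):

        /* sqe->addr = iovec array, sqe->len = number of entries,
         * sqe->buf_index = registered buffer the entries must point into
         */
        io_uring_prep_rw(IORING_OP_WRITEV_FIXED, sqe, fd, iov, nr_iovs, offset);
        sqe->buf_index = buf_index;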
include/uapi/linux/io_uring.h | 2 ++
io_uring/opdef.c | 39 +++++++++++++++++++++++++++
io_uring/rw.c | 51 +++++++++++++++++++++++++++++++++++
io_uring/rw.h | 2 ++
4 files changed, 94 insertions(+)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1e02e94bc26d..9dd384b369ee 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -280,6 +280,8 @@ enum io_uring_op {
IORING_OP_BIND,
IORING_OP_LISTEN,
IORING_OP_RECV_ZC,
+ IORING_OP_READV_FIXED,
+ IORING_OP_WRITEV_FIXED,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9511262c513e..6655d2cbf74d 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -529,6 +529,35 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_READV_FIXED] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollin = 1,
+ .plug = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .vectored = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_readv_fixed,
+ .issue = io_read,
+ },
+ [IORING_OP_WRITEV_FIXED] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollout = 1,
+ .plug = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .vectored = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_writev_fixed,
+ .issue = io_write,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -761,6 +790,16 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_RECV_ZC] = {
.name = "RECV_ZC",
},
+ [IORING_OP_READV_FIXED] = {
+ .name = "READV_FIXED",
+ .cleanup = io_readv_writev_cleanup,
+ .fail = io_rw_fail,
+ },
+ [IORING_OP_WRITEV_FIXED] = {
+ .name = "WRITEV_FIXED",
+ .cleanup = io_readv_writev_cleanup,
+ .fail = io_rw_fail,
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index ad7f647d48e9..4c4229f41aaa 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -381,6 +381,57 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_prep_rw(req, sqe, ITER_SOURCE);
}
+static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_async_rw *io = req->async_data;
+ const struct iovec __user *uvec;
+ size_t uvec_segs = rw->len;
+ struct iovec *iov;
+ int iovec_off, ret;
+ void *res;
+
+ if (uvec_segs > io->vec.nr) {
+ ret = io_vec_realloc(&io->vec, uvec_segs);
+ if (ret)
+ return ret;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+ /* pad iovec to the right */
+ iovec_off = io->vec.nr - uvec_segs;
+ iov = io->vec.iovec + iovec_off;
+ uvec = u64_to_user_ptr(rw->addr);
+ res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
+ io_is_compat(req->ctx));
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
+ uvec_segs, iovec_off, 0);
+ iov_iter_save_state(&io->iter, &io->iter_state);
+ return ret;
+}
+
+int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ int ret;
+
+ ret = __io_prep_rw(req, sqe, ITER_DEST);
+ if (unlikely(ret))
+ return ret;
+ return io_rw_prep_reg_vec(req, ITER_DEST);
+}
+
+int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ int ret;
+
+ ret = __io_prep_rw(req, sqe, ITER_SOURCE);
+ if (unlikely(ret))
+ return ret;
+ return io_rw_prep_reg_vec(req, ITER_SOURCE);
+}
+
/*
* Multishot read is prepared just like a normal read/write request, only
* difference is that we set the MULTISHOT flag.
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 529fd2f96a7f..81d6d9a8cf69 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -32,6 +32,8 @@ struct io_async_rw {
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe);
--
2.48.1
* [PATCH v2 4/9] io_uring/rw: defer reg buf vec import
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
Import registered buffers for vectored reads and writes at issue time
instead, as is already done for other fixed-buffer ops.
Signed-off-by: Pavel Begunkov <[email protected]>
---
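Schematically, the resulting two-phase flow (a condensed sketch of the
code below, not literal kernel code):

        prep:  io_rw_prep_reg_vec()
                   copy the user iovecs into io->vec (tail-padded)
                   set REQ_F_IMPORT_BUFFER (no buffer lookup yet)

        issue: io_read() / io_write()
                   if REQ_F_IMPORT_BUFFER is set, call
                   io_rw_import_reg_vec() to resolve the iovecs against
                   the registered buffer and clear the flag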
include/linux/io_uring_types.h | 3 +++
io_uring/rw.c | 42 +++++++++++++++++++++++++++-------
2 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index b770a2b12da6..d36fccda754b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -502,6 +502,7 @@ enum {
REQ_F_BUFFERS_COMMIT_BIT,
REQ_F_BUF_NODE_BIT,
REQ_F_HAS_METADATA_BIT,
+ REQ_F_IMPORT_BUFFER_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -584,6 +585,8 @@ enum {
REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
/* request has read/write metadata assigned */
REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
+ /* resolve padded iovec to registered buffers */
+ REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c4229f41aaa..e62f4ce34171 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -381,7 +381,25 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_prep_rw(req, sqe, ITER_SOURCE);
}
-static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
+static int io_rw_import_reg_vec(struct io_kiocb *req,
+ struct io_async_rw *io,
+ int ddir, unsigned int issue_flags)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ unsigned uvec_segs = rw->len;
+ unsigned iovec_off = io->vec.nr - uvec_segs;
+ int ret;
+
+ ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
+ uvec_segs, iovec_off, issue_flags);
+ if (unlikely(ret))
+ return ret;
+ iov_iter_save_state(&io->iter, &io->iter_state);
+ req->flags &= ~REQ_F_IMPORT_BUFFER;
+ return 0;
+}
+
+static int io_rw_prep_reg_vec(struct io_kiocb *req)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_async_rw *io = req->async_data;
@@ -406,10 +424,8 @@ static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
- uvec_segs, iovec_off, 0);
- iov_iter_save_state(&io->iter, &io->iter_state);
- return ret;
+ req->flags |= REQ_F_IMPORT_BUFFER;
+ return 0;
}
int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -419,7 +435,7 @@ int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ret = __io_prep_rw(req, sqe, ITER_DEST);
if (unlikely(ret))
return ret;
- return io_rw_prep_reg_vec(req, ITER_DEST);
+ return io_rw_prep_reg_vec(req);
}
int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -429,7 +445,7 @@ int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ret = __io_prep_rw(req, sqe, ITER_SOURCE);
if (unlikely(ret))
return ret;
- return io_rw_prep_reg_vec(req, ITER_SOURCE);
+ return io_rw_prep_reg_vec(req);
}
/*
@@ -906,7 +922,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
ssize_t ret;
loff_t *ppos;
- if (io_do_buffer_select(req)) {
+ if (req->flags & REQ_F_IMPORT_BUFFER) {
+ ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags);
+ if (unlikely(ret))
+ return ret;
+ } else if (io_do_buffer_select(req)) {
ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags);
if (unlikely(ret < 0))
return ret;
@@ -1117,6 +1137,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
ssize_t ret, ret2;
loff_t *ppos;
+ if (req->flags & REQ_F_IMPORT_BUFFER) {
+ ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags);
+ if (unlikely(ret))
+ return ret;
+ }
+
ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
if (unlikely(ret))
return ret;
--
2.48.1
* [PATCH v2 5/9] io_uring/net: combine msghdr copy
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
Call the compat version from inside io_msg_copy_hdr() instead of
duplicating it in the callers.
Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/net.c | 46 +++++++++++++++++++---------------------------
1 file changed, 19 insertions(+), 27 deletions(-)
diff --git a/io_uring/net.c b/io_uring/net.c
index 905d1ee01201..33076bd22c16 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -287,6 +287,24 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
struct user_msghdr __user *umsg = sr->umsg;
int ret;
+ iomsg->msg.msg_name = &iomsg->addr;
+ iomsg->msg.msg_iter.nr_segs = 0;
+
+ if (io_is_compat(req->ctx)) {
+ struct compat_msghdr cmsg;
+
+ ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
+ if (ret)
+ return ret;
+
+ memset(msg, 0, sizeof(*msg));
+ msg->msg_namelen = cmsg.msg_namelen;
+ msg->msg_controllen = cmsg.msg_controllen;
+ msg->msg_iov = compat_ptr(cmsg.msg_iov);
+ msg->msg_iovlen = cmsg.msg_iovlen;
+ return 0;
+ }
+
ret = io_copy_msghdr_from_user(msg, umsg);
if (unlikely(ret))
return ret;
@@ -323,18 +341,6 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
struct user_msghdr msg;
int ret;
- iomsg->msg.msg_name = &iomsg->addr;
- iomsg->msg.msg_iter.nr_segs = 0;
-
- if (io_is_compat(req->ctx)) {
- struct compat_msghdr cmsg;
-
- ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE,
- NULL);
- sr->msg_control = iomsg->msg.msg_control_user;
- return ret;
- }
-
ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL);
/* save msg_control as sys_sendmsg() overwrites it */
sr->msg_control = iomsg->msg.msg_control_user;
@@ -710,21 +716,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
struct user_msghdr msg;
int ret;
- iomsg->msg.msg_name = &iomsg->addr;
- iomsg->msg.msg_iter.nr_segs = 0;
-
- if (io_is_compat(req->ctx)) {
- struct compat_msghdr cmsg;
-
- ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST,
- &iomsg->uaddr);
- memset(&msg, 0, sizeof(msg));
- msg.msg_namelen = cmsg.msg_namelen;
- msg.msg_controllen = cmsg.msg_controllen;
- } else {
- ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
- }
-
+ ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
if (unlikely(ret))
return ret;
return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
--
2.48.1
* [PATCH v2 6/9] io_uring/net: pull vec alloc out of msghdr import
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
I'll need more control over iovec management, so move
io_net_import_vec() out of io_msg_copy_hdr().
Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/net.c | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/io_uring/net.c b/io_uring/net.c
index 33076bd22c16..cbb889b85cfc 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -253,12 +253,8 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
return -EFAULT;
sr->len = tmp_iov.iov_len;
}
-
- return 0;
}
-
- return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov,
- msg->msg_iovlen, ddir);
+ return 0;
}
static int io_copy_msghdr_from_user(struct user_msghdr *msg,
@@ -328,10 +324,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
return -EFAULT;
sr->len = tmp_iov.iov_len;
}
- return 0;
}
-
- return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir);
+ return 0;
}
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
@@ -342,6 +336,12 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
int ret;
ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL);
+ if (unlikely(ret))
+ return ret;
+
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
+ ITER_SOURCE);
/* save msg_control as sys_sendmsg() overwrites it */
sr->msg_control = iomsg->msg.msg_control_user;
return ret;
@@ -719,6 +719,13 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
if (unlikely(ret))
return ret;
+
+ if (!(req->flags & REQ_F_BUFFER_SELECT)) {
+ ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
+ ITER_DEST);
+ if (unlikely(ret))
+ return ret;
+ }
return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
msg.msg_controllen);
}
--
2.48.1
* [PATCH v2 7/9] io_uring/net: convert to struct iou_vec
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
Convert net.c to use struct iou_vec.
Signed-off-by: Pavel Begunkov <[email protected]>
---
io_uring/alloc_cache.h | 9 --------
io_uring/net.c | 51 ++++++++++++++++++------------------------
io_uring/net.h | 6 ++---
3 files changed, 25 insertions(+), 41 deletions(-)
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 0dd17d8ba93a..7094d9d0bd29 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache,
void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp);
-static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr)
-{
- if (IS_ENABLED(CONFIG_KASAN)) {
- kfree(*iov);
- *iov = NULL;
- *nr = 0;
- }
-}
-
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
void *entry)
{
diff --git a/io_uring/net.c b/io_uring/net.c
index cbb889b85cfc..a4b39343f345 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -136,11 +136,8 @@ static bool io_net_retry(struct socket *sock, int flags)
static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
- if (kmsg->free_iov) {
- kfree(kmsg->free_iov);
- kmsg->free_iov_nr = 0;
- kmsg->free_iov = NULL;
- }
+ if (kmsg->vec.iovec)
+ io_vec_free(&kmsg->vec);
}
static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
@@ -154,7 +151,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
}
/* Let normal cleanup path reap it if we fail adding to the cache */
- io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr);
+ io_alloc_cache_vec_kasan(&hdr->vec);
if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
@@ -171,7 +168,7 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
return NULL;
/* If the async data was cached, we might have an iov cached inside. */
- if (hdr->free_iov)
+ if (hdr->vec.iovec)
req->flags |= REQ_F_NEED_CLEANUP;
return hdr;
}
@@ -182,10 +179,7 @@ static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg
{
if (iov) {
req->flags |= REQ_F_NEED_CLEANUP;
- kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
- kmsg->free_iov = iov;
+ io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs);
}
}
@@ -208,9 +202,9 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg
struct iovec *iov;
int ret, nr_segs;
- if (iomsg->free_iov) {
- nr_segs = iomsg->free_iov_nr;
- iov = iomsg->free_iov;
+ if (iomsg->vec.iovec) {
+ nr_segs = iomsg->vec.nr;
+ iov = iomsg->vec.iovec;
} else {
nr_segs = 1;
iov = &iomsg->fast_iov;
@@ -468,7 +462,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
if (iter_is_ubuf(&kmsg->msg.msg_iter))
return 1;
- iov = kmsg->free_iov;
+ iov = kmsg->vec.iovec;
if (!iov)
iov = &kmsg->fast_iov;
@@ -584,9 +578,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
.nr_iovs = 1,
};
- if (kmsg->free_iov) {
- arg.nr_iovs = kmsg->free_iov_nr;
- arg.iovs = kmsg->free_iov;
+ if (kmsg->vec.iovec) {
+ arg.nr_iovs = kmsg->vec.nr;
+ arg.iovs = kmsg->vec.iovec;
arg.mode = KBUF_MODE_FREE;
}
@@ -599,9 +593,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
if (unlikely(ret < 0))
return ret;
- if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
- kmsg->free_iov_nr = ret;
- kmsg->free_iov = arg.iovs;
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+ kmsg->vec.nr = ret;
+ kmsg->vec.iovec = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP;
}
sr->len = arg.out_len;
@@ -1085,9 +1079,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
.mode = KBUF_MODE_EXPAND,
};
- if (kmsg->free_iov) {
- arg.nr_iovs = kmsg->free_iov_nr;
- arg.iovs = kmsg->free_iov;
+ if (kmsg->vec.iovec) {
+ arg.nr_iovs = kmsg->vec.nr;
+ arg.iovs = kmsg->vec.iovec;
arg.mode |= KBUF_MODE_FREE;
}
@@ -1106,9 +1100,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
}
iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
arg.out_len);
- if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
- kmsg->free_iov_nr = ret;
- kmsg->free_iov = arg.iovs;
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+ kmsg->vec.nr = ret;
+ kmsg->vec.iovec = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP;
}
} else {
@@ -1874,8 +1868,7 @@ void io_netmsg_cache_free(const void *entry)
{
struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
- if (kmsg->free_iov)
- io_netmsg_iovec_free(kmsg);
+ io_vec_free(&kmsg->vec);
kfree(kmsg);
}
#endif
diff --git a/io_uring/net.h b/io_uring/net.h
index b804c2b36e60..43e5ce5416b7 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -2,12 +2,12 @@
#include <linux/net.h>
#include <linux/uio.h>
+#include <linux/io_uring_types.h>
struct io_async_msghdr {
#if defined(CONFIG_NET)
- struct iovec *free_iov;
- /* points to an allocated iov, if NULL we use fast_iov instead */
- int free_iov_nr;
+ struct iou_vec vec;
+
struct_group(clear,
int namelen;
struct iovec fast_iov;
--
2.48.1
* [PATCH v2 8/9] io_uring/net: implement vectored reg bufs for zctx
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
Add support for vectored registered buffers for send zc.
Signed-off-by: Pavel Begunkov <[email protected]>
---
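A hypothetical userspace sketch of what this enables, assuming
liburing's io_uring_prep_sendmsg_zc() and a char *buf registered
earlier with io_uring_register_buffers():

        struct iovec iov[2] = {
                { .iov_base = buf,        .iov_len = 4096 },
                { .iov_base = buf + 8192, .iov_len = 4096 },
        };
        struct msghdr msg = {
                .msg_iov = iov,
                .msg_iovlen = 2,
        };
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

        io_uring_prep_sendmsg_zc(sqe, sockfd, &msg, 0);
        sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;       /* reused as the API */
        sqe->buf_index = 0;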
io_uring/net.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 55 insertions(+), 4 deletions(-)
diff --git a/io_uring/net.c b/io_uring/net.c
index a4b39343f345..5e27c22e1d58 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -395,6 +395,44 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
return io_sendmsg_copy_hdr(req, kmsg);
}
+static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_async_msghdr *kmsg = req->async_data;
+ struct user_msghdr msg;
+ int ret, iovec_off;
+ struct iovec *iov;
+ void *res;
+
+ if (!(sr->flags & IORING_RECVSEND_FIXED_BUF))
+ return io_sendmsg_setup(req, sqe);
+
+ sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
+ ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
+ if (unlikely(ret))
+ return ret;
+ sr->msg_control = kmsg->msg.msg_control_user;
+
+ if (msg.msg_iovlen > kmsg->vec.nr || WARN_ON_ONCE(!kmsg->vec.iovec)) {
+ ret = io_vec_realloc(&kmsg->vec, msg.msg_iovlen);
+ if (ret)
+ return ret;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+ iovec_off = kmsg->vec.nr - msg.msg_iovlen;
+ iov = kmsg->vec.iovec + iovec_off;
+
+ res = iovec_from_user(msg.msg_iov, msg.msg_iovlen, kmsg->vec.nr, iov,
+ io_is_compat(req->ctx));
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
+ req->flags |= REQ_F_IMPORT_BUFFER;
+ return ret;
+}
+
#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1333,8 +1371,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->opcode != IORING_OP_SEND_ZC) {
if (unlikely(sqe->addr2 || sqe->file_index))
return -EINVAL;
- if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
- return -EINVAL;
}
zc->len = READ_ONCE(sqe->len);
@@ -1350,7 +1386,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -ENOMEM;
if (req->opcode != IORING_OP_SENDMSG_ZC)
return io_send_setup(req, sqe);
- return io_sendmsg_setup(req, sqe);
+ return io_sendmsg_zc_setup(req, sqe);
}
static int io_sg_from_iter_iovec(struct sk_buff *skb,
@@ -1506,6 +1542,22 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
unsigned flags;
int ret, min_ret = 0;
+ kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
+
+ if (req->flags & REQ_F_IMPORT_BUFFER) {
+ unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
+ unsigned iovec_off = kmsg->vec.nr - uvec_segs;
+ int ret;
+
+ ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
+ &kmsg->vec, uvec_segs, iovec_off,
+ issue_flags);
+ if (unlikely(ret))
+ return ret;
+ kmsg->msg.sg_from_iter = io_sg_from_iter;
+ req->flags &= ~REQ_F_IMPORT_BUFFER;
+ }
+
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@@ -1524,7 +1576,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
kmsg->msg.msg_control_user = sr->msg_control;
kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
- kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (unlikely(ret < min_ret)) {
--
2.48.1
* [PATCH v2 9/9] io_uring: cap cached iovec/bvec size
From: Pavel Begunkov @ 2025-03-04 15:40 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Andres Freund
Bvec arrays can be large, so put an arbitrary limit on the maximum
vector size the cache is allowed to retain.
Signed-off-by: Pavel Begunkov <[email protected]>
---
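For scale: assuming typical 64-bit sizes, where both struct iovec and
struct bio_vec are 16 bytes, the cap bounds a cached vector at about a
page:

        256 entries * 16 bytes = 4096 bytes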
io_uring/net.c | 3 +++
io_uring/rsrc.h | 2 ++
io_uring/rw.c | 3 +++
3 files changed, 8 insertions(+)
diff --git a/io_uring/net.c b/io_uring/net.c
index 5e27c22e1d58..ce104d04b1e4 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -152,6 +152,9 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
/* Let normal cleanup path reap it if we fail adding to the cache */
io_alloc_cache_vec_kasan(&hdr->vec);
+ if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
+ io_vec_free(&hdr->vec);
+
if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f1496f7d844f..0bfcdba12617 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -6,6 +6,8 @@
#include <linux/lockdep.h>
#include <linux/io_uring_types.h>
+#define IO_VEC_CACHE_SOFT_CAP 256
+
enum {
IORING_RSRC_FILE = 0,
IORING_RSRC_BUFFER = 1,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index e62f4ce34171..bf35599d1078 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -150,6 +150,9 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
return;
io_alloc_cache_vec_kasan(&rw->vec);
+ if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP)
+ io_vec_free(&rw->vec);
+
if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
--
2.48.1
* Re: [PATCH v2 0/9] Add support for vectored registered buffers
From: Jens Axboe @ 2025-03-06 12:01 UTC (permalink / raw)
To: io-uring, Pavel Begunkov; +Cc: Andres Freund
On Tue, 04 Mar 2025 15:40:21 +0000, Pavel Begunkov wrote:
> Add registered buffer support for vectored io_uring operations. That
> allows to pass an iovec, all entries of which must belong to and
> point into the same registered buffer specified by sqe->buf_index.
>
> The series covers zerocopy sendmsg and reads / writes. Reads and
> writes are implemented as new opcodes, while zerocopy sendmsg
> reuses IORING_RECVSEND_FIXED_BUF for the api.
>
> [...]
Applied, thanks!
[1/9] io_uring: introduce struct iou_vec
commit: 32fd3277b4ae0f5e6f3a306b464f9b031e2408a8
[2/9] io_uring: add infra for importing vectored reg buffers
commit: 1a3339cbca2225dbcdc1f4da2b25ab83da818f1d
[3/9] io_uring/rw: implement vectored registered rw
commit: 7965e1cd6199cf9c87fa02e904cbc50c45c7310f
[4/9] io_uring/rw: defer reg buf vec import
commit: 5f0a1f815dad9490db822013a2f1feba3371f4d1
[5/9] io_uring/net: combine msghdr copy
commit: bc007e0aea60926b75b6a459ad8cf7ac357fb290
[6/9] io_uring/net: pull vec alloc out of msghdr import
commit: 8ff671f394f97e31bc6c1acec9ebbdb108177df9
[7/9] io_uring/net: convert to struct iou_vec
commit: 57b309177530bf99e59da21d1b1888ac4024072a
[8/9] io_uring/net: implement vectored reg bufs for zctx
commit: 6836bdad87cb83e96df0702d02d264224b0ffd2d
[9/9] io_uring: cap cached iovec/bvec size
commit: 0be2ba0a44e3670ac3f9eecd674341d77767288d
Best regards,
--
Jens Axboe
* Re: [PATCH v2 0/9] Add support for vectored registered buffers
From: Jens Axboe @ 2025-03-06 12:10 UTC (permalink / raw)
To: io-uring, Pavel Begunkov; +Cc: Andres Freund
On 3/6/25 5:01 AM, Jens Axboe wrote:
>
> On Tue, 04 Mar 2025 15:40:21 +0000, Pavel Begunkov wrote:
>> Add registered buffer support for vectored io_uring operations. That
>> allows to pass an iovec, all entries of which must belong to and
>> point into the same registered buffer specified by sqe->buf_index.
>>
>> The series covers zerocopy sendmsg and reads / writes. Reads and
>> writes are implemented as new opcodes, while zerocopy sendmsg
>> reuses IORING_RECVSEND_FIXED_BUF for the api.
>>
>> [...]
>
> Applied, thanks!
>
> [1/9] io_uring: introduce struct iou_vec
> commit: 32fd3277b4ae0f5e6f3a306b464f9b031e2408a8
> [2/9] io_uring: add infra for importing vectored reg buffers
> commit: 1a3339cbca2225dbcdc1f4da2b25ab83da818f1d
> [3/9] io_uring/rw: implement vectored registered rw
> commit: 7965e1cd6199cf9c87fa02e904cbc50c45c7310f
> [4/9] io_uring/rw: defer reg buf vec import
> commit: 5f0a1f815dad9490db822013a2f1feba3371f4d1
> [5/9] io_uring/net: combine msghdr copy
> commit: bc007e0aea60926b75b6a459ad8cf7ac357fb290
> [6/9] io_uring/net: pull vec alloc out of msghdr import
> commit: 8ff671f394f97e31bc6c1acec9ebbdb108177df9
> [7/9] io_uring/net: convert to struct iou_vec
> commit: 57b309177530bf99e59da21d1b1888ac4024072a
> [8/9] io_uring/net: implement vectored reg bufs for zctx
> commit: 6836bdad87cb83e96df0702d02d264224b0ffd2d
> [9/9] io_uring: cap cached iovec/bvec size
> commit: 0be2ba0a44e3670ac3f9eecd674341d77767288d
Note: the vectored fixed read/write opcodes got renumbered, as they
didn't sit on top of the epoll wait patches. Just a heads up for the
liburing side.
I'll get the basic epoll wait bits pushed up to liburing as well.
--
Jens Axboe
* Re: [PATCH v2 0/9] Add support for vectored registered buffers
From: Jens Axboe @ 2025-03-06 22:59 UTC (permalink / raw)
To: io-uring, Pavel Begunkov; +Cc: Andres Freund
On 3/6/25 5:10 AM, Jens Axboe wrote:
> On 3/6/25 5:01 AM, Jens Axboe wrote:
>>
>> On Tue, 04 Mar 2025 15:40:21 +0000, Pavel Begunkov wrote:
>>> Add registered buffer support for vectored io_uring operations. That
>>> allows to pass an iovec, all entries of which must belong to and
>>> point into the same registered buffer specified by sqe->buf_index.
>>>
>>> The series covers zerocopy sendmsg and reads / writes. Reads and
>>> writes are implemented as new opcodes, while zerocopy sendmsg
>>> reuses IORING_RECVSEND_FIXED_BUF for the api.
>>>
>>> [...]
>>
>> Applied, thanks!
>>
>> [1/9] io_uring: introduce struct iou_vec
>> commit: 32fd3277b4ae0f5e6f3a306b464f9b031e2408a8
>> [2/9] io_uring: add infra for importing vectored reg buffers
>> commit: 1a3339cbca2225dbcdc1f4da2b25ab83da818f1d
>> [3/9] io_uring/rw: implement vectored registered rw
>> commit: 7965e1cd6199cf9c87fa02e904cbc50c45c7310f
>> [4/9] io_uring/rw: defer reg buf vec import
>> commit: 5f0a1f815dad9490db822013a2f1feba3371f4d1
>> [5/9] io_uring/net: combine msghdr copy
>> commit: bc007e0aea60926b75b6a459ad8cf7ac357fb290
>> [6/9] io_uring/net: pull vec alloc out of msghdr import
>> commit: 8ff671f394f97e31bc6c1acec9ebbdb108177df9
>> [7/9] io_uring/net: convert to struct iou_vec
>> commit: 57b309177530bf99e59da21d1b1888ac4024072a
>> [8/9] io_uring/net: implement vectored reg bufs for zctx
>> commit: 6836bdad87cb83e96df0702d02d264224b0ffd2d
>> [9/9] io_uring: cap cached iovec/bvec size
>> commit: 0be2ba0a44e3670ac3f9eecd674341d77767288d
>
> Note: the vectored fixed read/write opcodes got renumbered, as they
> didn't sit on top of the epoll wait patches. Just a heads up, in terms
> of the liburing side.
>
> I'll get the basic epoll wait bits pushed up to liburing as well.
And one more note: this breaks 32-bit compiles due to a bad assumption
on iovec vs bio_vec sizing, so I've dropped it for now. Hopefully we can
get a v3 into the 6.15 branch.
--
Jens Axboe
* Re: [PATCH v2 2/9] io_uring: add infra for importing vectored reg buffers
From: Pavel Begunkov @ 2025-03-07 14:07 UTC (permalink / raw)
To: io-uring; +Cc: Andres Freund
On 3/4/25 15:40, Pavel Begunkov wrote:
> Add io_import_reg_vec(), which will be responsible for importing
> vectored registered buffers. iovecs are overlapped with the resulting
> bvec in memory, which is why the iovec is expected to be padded in
> iou_vec.
>
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
...
> +int io_import_reg_vec(int ddir, struct iov_iter *iter,
> + struct io_kiocb *req, struct iou_vec *vec,
> + unsigned nr_iovs, unsigned iovec_off,
> + unsigned issue_flags)
> +{
> + struct io_rsrc_node *node;
> + struct io_mapped_ubuf *imu;
> + struct iovec *iov;
> + unsigned nr_segs;
> +
> + node = io_find_buf_node(req, issue_flags);
> + if (!node)
> + return -EFAULT;
> + imu = node->buf;
> + if (imu->is_kbuf)
> + return -EOPNOTSUPP;
> + if (!(imu->dir & (1 << ddir)))
> + return -EFAULT;
> +
> + iov = vec->iovec + iovec_off;
> + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
size_t entry_sz = sizeof(struct iovec);
size_t bvec_bytes = nr_segs * sizeof(struct bio_vec);
size_t iovec_off = (bvec_bytes + entry_sz - 1) / entry_sz;
nr_segs += iovec_off;
}
How about fixing it up like this for now? Instead of overlapping the
bvec with the iovec, it'd put them back to back and waste some memory
on 32-bit.
I can try to make it a bit tighter, remove the if and let
the compiler optimise it into a no-op for x64, or allocate
max(bvec, iovec) * nr and see where it leads. But either way,
IMHO it's better left until I get more time.
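For concreteness, assuming typical 32-bit sizes (struct iovec is 8
bytes, struct bio_vec is 12), with e.g. nr_segs = 16:

        bvec_bytes = 16 * 12 = 192
        iovec_off  = (192 + 8 - 1) / 8 = 24 iovec-sized slots
        nr_segs    = 16 + 24 = 40 entries total

i.e. the iovecs are stored after the bvec area instead of overlapping
it, at the cost of some extra memory.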
> +
> + if (WARN_ON_ONCE(iovec_off + nr_iovs != vec->nr) ||
> + nr_segs > vec->nr) {
> + struct iou_vec tmp_vec = {};
> + int ret;
> +
> + ret = io_vec_realloc(&tmp_vec, nr_segs);
> + if (ret)
> + return ret;
> +
> + iovec_off = tmp_vec.nr - nr_iovs;
> + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
> + io_vec_free(vec);
> +
> + *vec = tmp_vec;
> + iov = vec->iovec + iovec_off;
> + req->flags |= REQ_F_NEED_CLEANUP;
> + }
> +
> + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
> +}
--
Pavel Begunkov
* Re: [PATCH v2 0/9] Add support for vectored registered buffers
From: Pavel Begunkov @ 2025-03-07 14:14 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: Andres Freund
On 3/6/25 22:59, Jens Axboe wrote:
> On 3/6/25 5:10 AM, Jens Axboe wrote:
>> On 3/6/25 5:01 AM, Jens Axboe wrote:
>>>
>>> On Tue, 04 Mar 2025 15:40:21 +0000, Pavel Begunkov wrote:
>>>> Add registered buffer support for vectored io_uring operations. That
>>>> allows to pass an iovec, all entries of which must belong to and
>>>> point into the same registered buffer specified by sqe->buf_index.
>>>>
>>>> The series covers zerocopy sendmsg and reads / writes. Reads and
>>>> writes are implemented as new opcodes, while zerocopy sendmsg
>>>> reuses IORING_RECVSEND_FIXED_BUF for the api.
>>>>
>>>> [...]
>>>
>>> Applied, thanks!
>>>
>>> [1/9] io_uring: introduce struct iou_vec
>>> commit: 32fd3277b4ae0f5e6f3a306b464f9b031e2408a8
>>> [2/9] io_uring: add infra for importing vectored reg buffers
>>> commit: 1a3339cbca2225dbcdc1f4da2b25ab83da818f1d
>>> [3/9] io_uring/rw: implement vectored registered rw
>>> commit: 7965e1cd6199cf9c87fa02e904cbc50c45c7310f
>>> [4/9] io_uring/rw: defer reg buf vec import
>>> commit: 5f0a1f815dad9490db822013a2f1feba3371f4d1
>>> [5/9] io_uring/net: combine msghdr copy
>>> commit: bc007e0aea60926b75b6a459ad8cf7ac357fb290
>>> [6/9] io_uring/net: pull vec alloc out of msghdr import
>>> commit: 8ff671f394f97e31bc6c1acec9ebbdb108177df9
>>> [7/9] io_uring/net: convert to struct iou_vec
>>> commit: 57b309177530bf99e59da21d1b1888ac4024072a
>>> [8/9] io_uring/net: implement vectored reg bufs for zctx
>>> commit: 6836bdad87cb83e96df0702d02d264224b0ffd2d
>>> [9/9] io_uring: cap cached iovec/bvec size
>>> commit: 0be2ba0a44e3670ac3f9eecd674341d77767288d
>>
>> Note: the vectored fixed read/write opcodes got renumbered, as they
>> didn't sit on top of the epoll wait patches. Just a heads up, in terms
>> of the liburing side.
>>
>> I'll get the basic epoll wait bits pushed up to liburing as well.
>
> And one more note: this breaks 32-bit compiles due to a bad assumption
> on iovec vs bio_vec sizing, so I've dropped it for now. Hopefully we can
> get a v3 into the 6.15 branch.
I saw that; at least the build check did its job. I'm inclined
towards the option in my reply to 2/9. It's generic in the sense that
it can be unconditional; it might be a good idea to kill the if and
accept the extra memory consumption for now, until it's improved.
--
Pavel Begunkov
* Re: [PATCH v2 2/9] io_uring: add infra for importing vectored reg buffers
From: Jens Axboe @ 2025-03-07 14:14 UTC (permalink / raw)
To: Pavel Begunkov, io-uring; +Cc: Andres Freund
On 3/7/25 7:07 AM, Pavel Begunkov wrote:
> On 3/4/25 15:40, Pavel Begunkov wrote:
>> Add io_import_reg_vec(), which will be responsible for importing
>> vectored registered buffers. iovecs are overlapped with the resulting
>> bvec in memory, which is why the iovec is expected to be padded in
>> iou_vec.
>>
>> Signed-off-by: Pavel Begunkov <[email protected]>
>> ---
> ...
>> +int io_import_reg_vec(int ddir, struct iov_iter *iter,
>> + struct io_kiocb *req, struct iou_vec *vec,
>> + unsigned nr_iovs, unsigned iovec_off,
>> + unsigned issue_flags)
>> +{
>> + struct io_rsrc_node *node;
>> + struct io_mapped_ubuf *imu;
>> + struct iovec *iov;
>> + unsigned nr_segs;
>> +
>> + node = io_find_buf_node(req, issue_flags);
>> + if (!node)
>> + return -EFAULT;
>> + imu = node->buf;
>> + if (imu->is_kbuf)
>> + return -EOPNOTSUPP;
>> + if (!(imu->dir & (1 << ddir)))
>> + return -EFAULT;
>> +
>> + iov = vec->iovec + iovec_off;
>> + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
>
> if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
> size_t entry_sz = sizeof(struct iovec);
> size_t bvec_bytes = nr_segs * sizeof(struct bio_vec);
> size_t iovec_off = (bvec_bytes + entry_sz - 1) / entry_sz;
>
> nr_segs += iovec_off;
> }
>
> How about fixing it up like this for now? Instead of overlapping
> bvec with iovec, it'd put them back to back and waste some memory
> on 32bit.
>
> I can try to make it a bit tighter, remove the if and let
> the compiler to optimise it into no-op for x64, or allocate
> max(bvec, iovec) * nr and see where it leads. But in either
> way IMHO it's better to be left until I get more time.
I think that looks reasonable. Nobody cares about this on 32-bit,
beyond it needing to work, obviously.
--
Jens Axboe