* [PATCHSET v2 0/2] Add support for cloning partial buffer sets
@ 2024-10-31 1:44 Jens Axboe
2024-10-31 1:44 ` [PATCH 1/2] io_uring/rsrc: allow cloning at an offset Jens Axboe
2024-10-31 1:44 ` [PATCH 2/2] io_uring/rsrc: allow cloning with node replacements Jens Axboe
0 siblings, 2 replies; 3+ messages in thread
From: Jens Axboe @ 2024-10-31 1:44 UTC (permalink / raw)
To: io-uring
Hi,
6.12 added buffer cloning support, but it's an all-or-nothing kind of
thing - if there's an existing buffer table in the destination ring,
then nothing can be cloned to it.
This adds support for cloning partial buffer sets, specifying a
source/dest offset and the number of buffers to clone. And it allows
cloning to replace existing nodes as well, specified with a separate
flag.
Changes since v1:
- Rebase on current tree (no rsrc_empty_node)
- Rewrite the replacement code to be much simpler
- Write more test cases
--
Jens Axboe
^ permalink raw reply [flat|nested] 3+ messages in thread
* [PATCH 1/2] io_uring/rsrc: allow cloning at an offset
2024-10-31 1:44 [PATCHSET v2 0/2] Add support for cloning partial buffer sets Jens Axboe
@ 2024-10-31 1:44 ` Jens Axboe
2024-10-31 1:44 ` [PATCH 2/2] io_uring/rsrc: allow cloning with node replacements Jens Axboe
1 sibling, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2024-10-31 1:44 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
Right now buffer cloning is an all-or-nothing kind of thing - either the
whole table is cloned from a source to a destination ring, or nothing at
all.
However, it's not always desired to clone the whole thing. Allow for
the application to specify a source and destination offset, and a
number of buffers to clone. If the destination offset is non-zero, then
allocate sparse nodes upfront.
Signed-off-by: Jens Axboe <[email protected]>
---
include/uapi/linux/io_uring.h | 5 ++++-
io_uring/rsrc.c | 32 ++++++++++++++++++++++++++------
2 files changed, 30 insertions(+), 7 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 024745283783..cc8dbe78c126 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -719,7 +719,10 @@ enum {
struct io_uring_clone_buffers {
__u32 src_fd;
__u32 flags;
- __u32 pad[6];
+ __u32 src_off;
+ __u32 dst_off;
+ __u32 nr;
+ __u32 pad[3];
};
struct io_uring_buf {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index af60d9f597be..d00870128bb9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -924,10 +924,11 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
return 0;
}
-static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
+ struct io_uring_clone_buffers *arg)
{
+ int i, ret, nbufs, off, nr;
struct io_rsrc_data data;
- int i, ret, nbufs;
/*
* Drop our own lock here. We'll setup the data we need and reference
@@ -940,11 +941,29 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
nbufs = src_ctx->buf_table.nr;
if (!nbufs)
goto out_unlock;
- ret = io_rsrc_data_alloc(&data, nbufs);
+ ret = -EINVAL;
+ if (!arg->nr)
+ arg->nr = nbufs;
+ else if (arg->nr > nbufs)
+ goto out_unlock;
+ ret = -EOVERFLOW;
+ if (check_add_overflow(arg->nr, arg->src_off, &off))
+ goto out_unlock;
+ if (off > nbufs)
+ goto out_unlock;
+ if (check_add_overflow(arg->nr, arg->dst_off, &off))
+ goto out_unlock;
+ ret = -EINVAL;
+ if (off > IORING_MAX_REG_BUFFERS)
+ goto out_unlock;
+ ret = io_rsrc_data_alloc(&data, off);
if (ret)
goto out_unlock;
- for (i = 0; i < nbufs; i++) {
+ off = arg->dst_off;
+ i = arg->src_off;
+ nr = arg->nr;
+ while (nr--) {
struct io_rsrc_node *dst_node, *src_node;
src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
@@ -960,7 +979,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
refcount_inc(&src_node->buf->refs);
dst_node->buf = src_node->buf;
}
- data.nodes[i] = dst_node;
+ data.nodes[off++] = dst_node;
+ i++;
}
/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
@@ -1015,7 +1035,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
file = io_uring_register_get_file(buf.src_fd, registered_src);
if (IS_ERR(file))
return PTR_ERR(file);
- ret = io_clone_buffers(ctx, file->private_data);
+ ret = io_clone_buffers(ctx, file->private_data, &buf);
if (!registered_src)
fput(file);
return ret;
--
2.45.2
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH 2/2] io_uring/rsrc: allow cloning with node replacements
2024-10-31 1:44 [PATCHSET v2 0/2] Add support for cloning partial buffer sets Jens Axboe
2024-10-31 1:44 ` [PATCH 1/2] io_uring/rsrc: allow cloning at an offset Jens Axboe
@ 2024-10-31 1:44 ` Jens Axboe
1 sibling, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2024-10-31 1:44 UTC (permalink / raw)
To: io-uring; +Cc: Jens Axboe
Currently cloning a buffer table will fail if the destination already has
a table. But it should be possible to use it to replace existing elements.
Add a IORING_REGISTER_DST_REPLACE cloning flag, which if set, will allow
the destination to already having a buffer table. If that is the case,
then entries designated by offset + nr buffers will be replaced if they
already exist.
Note that it's allowed to use IORING_REGISTER_DST_REPLACE and not have
an existing table, in which case it'll work just like not having the
flag set and an empty table - it'll just assign the newly created table
for that case.
Signed-off-by: Jens Axboe <[email protected]>
---
include/uapi/linux/io_uring.h | 3 +-
io_uring/rsrc.c | 66 +++++++++++++++++++++++++++--------
2 files changed, 54 insertions(+), 15 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cc8dbe78c126..ce58c4590de6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -713,7 +713,8 @@ struct io_uring_clock_register {
};
enum {
- IORING_REGISTER_SRC_REGISTERED = 1,
+ IORING_REGISTER_SRC_REGISTERED = (1U << 0),
+ IORING_REGISTER_DST_REPLACE = (1U << 1),
};
struct io_uring_clone_buffers {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d00870128bb9..673ff00da727 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -927,8 +927,40 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
struct io_uring_clone_buffers *arg)
{
- int i, ret, nbufs, off, nr;
struct io_rsrc_data data;
+ int i, ret, off, nr;
+ unsigned int nbufs;
+
+ /* if offsets are given, must have nr specified too */
+ if (!arg->nr && (arg->dst_off || arg->src_off))
+ return -EINVAL;
+ /* not allowed unless REPLACE is set */
+ if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
+ return -EBUSY;
+
+ nbufs = READ_ONCE(src_ctx->buf_table.nr);
+ if (!arg->nr)
+ arg->nr = nbufs;
+ else if (arg->nr > nbufs)
+ return -EINVAL;
+ else if (arg->nr > IORING_MAX_REG_BUFFERS)
+ return -EINVAL;
+ if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
+ return -EOVERFLOW;
+
+ ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
+ if (ret)
+ return ret;
+
+ /* Fill entries in data from dst that won't overlap with src */
+ for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
+ struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+
+ if (src_node) {
+ data.nodes[i] = src_node;
+ src_node->refs++;
+ }
+ }
/*
* Drop our own lock here. We'll setup the data we need and reference
@@ -951,14 +983,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
goto out_unlock;
if (off > nbufs)
goto out_unlock;
- if (check_add_overflow(arg->nr, arg->dst_off, &off))
- goto out_unlock;
- ret = -EINVAL;
- if (off > IORING_MAX_REG_BUFFERS)
- goto out_unlock;
- ret = io_rsrc_data_alloc(&data, off);
- if (ret)
- goto out_unlock;
off = arg->dst_off;
i = arg->src_off;
@@ -986,6 +1010,20 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
mutex_unlock(&src_ctx->uring_lock);
mutex_lock(&ctx->uring_lock);
+
+ /*
+ * If asked for replace, put the old table. data->nodes[] holds both
+ * old and new nodes at this point.
+ */
+ if (arg->flags & IORING_REGISTER_DST_REPLACE)
+ io_rsrc_data_free(&ctx->buf_table);
+
+ /*
+ * ctx->buf_table should be empty now - either the contents are being
+ * replaced and we just freed the table, or someone raced setting up
+ * a buffer table while the clone was happening. If not empty, fall
+ * through to failure handling.
+ */
if (!ctx->buf_table.nr) {
ctx->buf_table = data;
return 0;
@@ -995,14 +1033,14 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
mutex_lock(&src_ctx->uring_lock);
/* someone raced setting up buffers, dump ours */
ret = -EBUSY;
- i = nbufs;
out_put_free:
+ i = data.nr;
while (i--) {
io_buffer_unmap(src_ctx, data.nodes[i]);
kfree(data.nodes[i]);
}
- io_rsrc_data_free(&data);
out_unlock:
+ io_rsrc_data_free(&data);
mutex_unlock(&src_ctx->uring_lock);
mutex_lock(&ctx->uring_lock);
return ret;
@@ -1022,12 +1060,12 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
struct file *file;
int ret;
- if (ctx->buf_table.nr)
- return -EBUSY;
if (copy_from_user(&buf, arg, sizeof(buf)))
return -EFAULT;
- if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
+ if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
return -EINVAL;
+ if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
+ return -EBUSY;
if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
return -EINVAL;
--
2.45.2
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2024-10-31 1:46 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-10-31 1:44 [PATCHSET v2 0/2] Add support for cloning partial buffer sets Jens Axboe
2024-10-31 1:44 ` [PATCH 1/2] io_uring/rsrc: allow cloning at an offset Jens Axboe
2024-10-31 1:44 ` [PATCH 2/2] io_uring/rsrc: allow cloning with node replacements Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox