* [PATCHv3 1/5] io_uring: move fixed buffer import to issue path
2025-02-14 15:43 [PATCHv3 0/5] ublk zero-copy support Keith Busch
@ 2025-02-14 15:43 ` Keith Busch
2025-02-18 20:32 ` Caleb Sander Mateos
2025-02-14 15:43 ` [PATCHv3 2/5] io_uring: add support for kernel registered bvecs Keith Busch
` (3 subsequent siblings)
4 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2025-02-14 15:43 UTC (permalink / raw)
To: ming.lei, asml.silence, axboe, linux-block, io-uring; +Cc: bernd, Keith Busch
From: Keith Busch <[email protected]>
Similiar to the fixed file path, requests may depend on a previous
command to set up an index, so we need to allow linking them. The prep
callback happens too soon for linked commands, so the lookup needs to be
defered to the issue path. Change the prep callbacks to just set the
buf_index and let generic io_uring code handle the fixed buffer node
setup, just like it does for fixed files.
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/io_uring_types.h | 3 +++
io_uring/io_uring.c | 19 ++++++++++++++
io_uring/net.c | 25 ++++++-------------
io_uring/nop.c | 22 +++--------------
io_uring/rw.c | 45 ++++++++++++++++++++++++----------
io_uring/uring_cmd.c | 16 ++----------
6 files changed, 67 insertions(+), 63 deletions(-)
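Not part of the diff, but a rough sketch of the userspace flow this
deferral enables, assuming some linked command populates the table slot
first (e.g. the ublk buffer registration added later in this series);
the prep helper below is purely illustrative:

	/* SQE 0: a command that installs a buffer at table slot 0. At prep
	 * time the slot is still empty, which is why the lookup has to
	 * wait until issue time. */
	sqe = io_uring_get_sqe(ring);
	prep_register_buffer_cmd(sqe, 0);	/* illustrative helper */
	sqe->flags |= IOSQE_IO_LINK;

	/* SQE 1: consume the just-registered buffer by its fixed index */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read_fixed(sqe, fd, NULL, len, 0, 0); /* offset 0, buf_index 0 */

	io_uring_submit(ring);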
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e2fef264ff8b8..d5bf336882aa8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -469,6 +469,7 @@ enum {
REQ_F_DOUBLE_POLL_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
+ REQ_F_FIXED_BUFFER_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -561,6 +562,8 @@ enum {
REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
/* request has read/write metadata assigned */
REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
+ /* request has a fixed buffer at buf_index */
+ REQ_F_FIXED_BUFFER = IO_REQ_FLAG(REQ_F_FIXED_BUFFER_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4a0944a57d963..a5be6ec99d153 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1720,6 +1720,23 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
return !!req->file;
}
+static bool io_assign_buffer(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_rsrc_node *node;
+
+ if (req->buf_node || !(req->flags & REQ_F_FIXED_BUFFER))
+ return true;
+
+ io_ring_submit_lock(ctx, issue_flags);
+ node = io_rsrc_node_lookup(&ctx->buf_table.data, req->buf_index);
+ if (node)
+ io_req_assign_buf_node(req, node);
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ return !!node;
+}
+
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
@@ -1728,6 +1745,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!io_assign_file(req, def, issue_flags)))
return -EBADF;
+ if (unlikely(!io_assign_buffer(req, issue_flags)))
+ return -EFAULT;
if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
diff --git a/io_uring/net.c b/io_uring/net.c
index 10344b3a6d89c..0185925e40bfb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1299,6 +1299,10 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#endif
if (unlikely(!io_msg_alloc_async(req)))
return -ENOMEM;
+ if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
+ req->buf_index = zc->buf_index;
+ req->flags |= REQ_F_FIXED_BUFFER;
+ }
if (req->opcode != IORING_OP_SENDMSG_ZC)
return io_send_setup(req, sqe);
return io_sendmsg_setup(req, sqe);
@@ -1360,25 +1364,10 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
struct io_async_msghdr *kmsg = req->async_data;
int ret;
- if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_rsrc_node *node;
-
- ret = -EFAULT;
- io_ring_submit_lock(ctx, issue_flags);
- node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
- if (node) {
- io_req_assign_buf_node(sr->notif, node);
- ret = 0;
- }
- io_ring_submit_unlock(ctx, issue_flags);
-
- if (unlikely(ret))
- return ret;
-
+ if (req->flags & REQ_F_FIXED_BUFFER) {
ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
- node->buf, (u64)(uintptr_t)sr->buf,
- sr->len);
+ req->buf_node->buf,
+ (u64)(uintptr_t)sr->buf, sr->len);
if (unlikely(ret))
return ret;
kmsg->msg.sg_from_iter = io_sg_from_iter;
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 5e5196df650a1..989908603112f 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -16,7 +16,6 @@ struct io_nop {
struct file *file;
int result;
int fd;
- int buffer;
unsigned int flags;
};
@@ -39,10 +38,10 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
nop->fd = READ_ONCE(sqe->fd);
else
nop->fd = -1;
- if (nop->flags & IORING_NOP_FIXED_BUFFER)
- nop->buffer = READ_ONCE(sqe->buf_index);
- else
- nop->buffer = -1;
+ if (nop->flags & IORING_NOP_FIXED_BUFFER) {
+ req->buf_index = READ_ONCE(sqe->buf_index);
+ req->flags |= REQ_F_FIXED_BUFFER;
+ }
return 0;
}
@@ -63,19 +62,6 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
goto done;
}
}
- if (nop->flags & IORING_NOP_FIXED_BUFFER) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_rsrc_node *node;
-
- ret = -EFAULT;
- io_ring_submit_lock(ctx, issue_flags);
- node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer);
- if (node) {
- io_req_assign_buf_node(req, node);
- ret = 0;
- }
- io_ring_submit_unlock(ctx, issue_flags);
- }
done:
if (ret < 0)
req_set_fail(req);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 7aa1e4c9f64a3..f37cd883d1625 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -353,25 +353,14 @@ int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe,
int ddir)
{
- struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_rsrc_node *node;
- struct io_async_rw *io;
int ret;
ret = io_prep_rw(req, sqe, ddir, false);
if (unlikely(ret))
return ret;
- node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
- if (!node)
- return -EFAULT;
- io_req_assign_buf_node(req, node);
-
- io = req->async_data;
- ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
- iov_iter_save_state(&io->iter, &io->iter_state);
- return ret;
+ req->flags |= REQ_F_FIXED_BUFFER;
+ return 0;
}
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -954,10 +943,36 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
+static int io_import_fixed_buffer(struct io_kiocb *req, int ddir)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_async_rw *io;
+ int ret;
+
+ if (!(req->flags & REQ_F_FIXED_BUFFER))
+ return 0;
+
+ io = req->async_data;
+ if (io->bytes_done)
+ return 0;
+
+ ret = io_import_fixed(ddir, &io->iter, req->buf_node->buf, rw->addr,
+ rw->len);
+ if (ret)
+ return ret;
+
+ iov_iter_save_state(&io->iter, &io->iter_state);
+ return 0;
+}
+
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
int ret;
+ ret = io_import_fixed_buffer(req, READ);
+ if (unlikely(ret))
+ return ret;
+
ret = __io_read(req, issue_flags);
if (ret >= 0)
return kiocb_done(req, ret, issue_flags);
@@ -1062,6 +1077,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
ssize_t ret, ret2;
loff_t *ppos;
+ ret = io_import_fixed_buffer(req, WRITE);
+ if (unlikely(ret))
+ return ret;
+
ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
if (unlikely(ret))
return ret;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 1f6a82128b475..70210b4e0b0f6 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -202,19 +202,8 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_rsrc_node *node;
- u16 index = READ_ONCE(sqe->buf_index);
-
- node = io_rsrc_node_lookup(&ctx->buf_table, index);
- if (unlikely(!node))
- return -EFAULT;
- /*
- * Pi node upfront, prior to io_uring_cmd_import_fixed()
- * being called. This prevents destruction of the mapped buffer
- * we'll need at actual import time.
- */
- io_req_assign_buf_node(req, node);
+ req->buf_index = READ_ONCE(sqe->buf_index);
+ req->flags |= REQ_F_FIXED_BUFFER;
}
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
@@ -272,7 +261,6 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
struct io_rsrc_node *node = req->buf_node;
- /* Must have had rsrc_node assigned at prep time */
if (node)
return io_import_fixed(rw, iter, node->buf, ubuf, len);
--
2.43.5
* Re: [PATCHv3 1/5] io_uring: move fixed buffer import to issue path
2025-02-14 15:43 ` [PATCHv3 1/5] io_uring: move fixed buffer import to issue path Keith Busch
@ 2025-02-18 20:32 ` Caleb Sander Mateos
0 siblings, 0 replies; 17+ messages in thread
From: Caleb Sander Mateos @ 2025-02-18 20:32 UTC (permalink / raw)
To: Keith Busch
Cc: ming.lei, asml.silence, axboe, linux-block, io-uring, bernd,
Keith Busch
On Fri, Feb 14, 2025 at 7:45 AM Keith Busch <[email protected]> wrote:
>
> From: Keith Busch <[email protected]>
>
> Similiar to the fixed file path, requests may depend on a previous
typo: "Similiar" -> "Similar"
> command to set up an index, so we need to allow linking them. The prep
> callback happens too soon for linked commands, so the lookup needs to be
> defered to the issue path. Change the prep callbacks to just set the
typo: "defered" -> "deferred"
> buf_index and let generic io_uring code handle the fixed buffer node
> setup, just like it does for fixed files.
>
> Signed-off-by: Keith Busch <[email protected]>
> ---
> include/linux/io_uring_types.h | 3 +++
> io_uring/io_uring.c | 19 ++++++++++++++
> io_uring/net.c | 25 ++++++-------------
> io_uring/nop.c | 22 +++--------------
> io_uring/rw.c | 45 ++++++++++++++++++++++++----------
> io_uring/uring_cmd.c | 16 ++----------
> 6 files changed, 67 insertions(+), 63 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index e2fef264ff8b8..d5bf336882aa8 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -469,6 +469,7 @@ enum {
> REQ_F_DOUBLE_POLL_BIT,
> REQ_F_APOLL_MULTISHOT_BIT,
> REQ_F_CLEAR_POLLIN_BIT,
> + REQ_F_FIXED_BUFFER_BIT,
> /* keep async read/write and isreg together and in order */
> REQ_F_SUPPORT_NOWAIT_BIT,
> REQ_F_ISREG_BIT,
> @@ -561,6 +562,8 @@ enum {
> REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
> /* request has read/write metadata assigned */
> REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
> + /* request has a fixed buffer at buf_index */
> + REQ_F_FIXED_BUFFER = IO_REQ_FLAG(REQ_F_FIXED_BUFFER_BIT),
> };
>
> typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 4a0944a57d963..a5be6ec99d153 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -1720,6 +1720,23 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
> return !!req->file;
> }
>
> +static bool io_assign_buffer(struct io_kiocb *req, unsigned int issue_flags)
> +{
> + struct io_ring_ctx *ctx = req->ctx;
> + struct io_rsrc_node *node;
> +
> + if (req->buf_node || !(req->flags & REQ_F_FIXED_BUFFER))
> + return true;
> +
> + io_ring_submit_lock(ctx, issue_flags);
> + node = io_rsrc_node_lookup(&ctx->buf_table.data, req->buf_index);
This patch fails to compile on its own:
io_uring/io_uring.c: In function 'io_assign_buffer':
io_uring/io_uring.c:1894:51: error: 'struct io_rsrc_data' has no
member named 'data'
1894 | node = io_rsrc_node_lookup(&ctx->buf_table.data,
req->buf_index);
| ^
The data field appears to be added by the later patch "io_uring: add
abstraction for buf_table rsrc data". Probably .data should be dropped
in this patch and added in the later one instead.
Best,
Caleb
> + if (node)
> + io_req_assign_buf_node(req, node);
> + io_ring_submit_unlock(ctx, issue_flags);
> +
> + return !!node;
> +}
> +
> static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
> {
> const struct io_issue_def *def = &io_issue_defs[req->opcode];
> @@ -1728,6 +1745,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
>
> if (unlikely(!io_assign_file(req, def, issue_flags)))
> return -EBADF;
> + if (unlikely(!io_assign_buffer(req, issue_flags)))
> + return -EFAULT;
>
> if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
> creds = override_creds(req->creds);
> diff --git a/io_uring/net.c b/io_uring/net.c
> index 10344b3a6d89c..0185925e40bfb 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -1299,6 +1299,10 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> #endif
> if (unlikely(!io_msg_alloc_async(req)))
> return -ENOMEM;
> + if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
> + req->buf_index = zc->buf_index;
> + req->flags |= REQ_F_FIXED_BUFFER;
> + }
> if (req->opcode != IORING_OP_SENDMSG_ZC)
> return io_send_setup(req, sqe);
> return io_sendmsg_setup(req, sqe);
> @@ -1360,25 +1364,10 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
> struct io_async_msghdr *kmsg = req->async_data;
> int ret;
>
> - if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
> - struct io_ring_ctx *ctx = req->ctx;
> - struct io_rsrc_node *node;
> -
> - ret = -EFAULT;
> - io_ring_submit_lock(ctx, issue_flags);
> - node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
> - if (node) {
> - io_req_assign_buf_node(sr->notif, node);
> - ret = 0;
> - }
> - io_ring_submit_unlock(ctx, issue_flags);
> -
> - if (unlikely(ret))
> - return ret;
> -
> + if (req->flags & REQ_F_FIXED_BUFFER) {
> ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
> - node->buf, (u64)(uintptr_t)sr->buf,
> - sr->len);
> + req->buf_node->buf,
> + (u64)(uintptr_t)sr->buf, sr->len);
> if (unlikely(ret))
> return ret;
> kmsg->msg.sg_from_iter = io_sg_from_iter;
> diff --git a/io_uring/nop.c b/io_uring/nop.c
> index 5e5196df650a1..989908603112f 100644
> --- a/io_uring/nop.c
> +++ b/io_uring/nop.c
> @@ -16,7 +16,6 @@ struct io_nop {
> struct file *file;
> int result;
> int fd;
> - int buffer;
> unsigned int flags;
> };
>
> @@ -39,10 +38,10 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> nop->fd = READ_ONCE(sqe->fd);
> else
> nop->fd = -1;
> - if (nop->flags & IORING_NOP_FIXED_BUFFER)
> - nop->buffer = READ_ONCE(sqe->buf_index);
> - else
> - nop->buffer = -1;
> + if (nop->flags & IORING_NOP_FIXED_BUFFER) {
> + req->buf_index = READ_ONCE(sqe->buf_index);
> + req->flags |= REQ_F_FIXED_BUFFER;
> + }
> return 0;
> }
>
> @@ -63,19 +62,6 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
> goto done;
> }
> }
> - if (nop->flags & IORING_NOP_FIXED_BUFFER) {
> - struct io_ring_ctx *ctx = req->ctx;
> - struct io_rsrc_node *node;
> -
> - ret = -EFAULT;
> - io_ring_submit_lock(ctx, issue_flags);
> - node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer);
> - if (node) {
> - io_req_assign_buf_node(req, node);
> - ret = 0;
> - }
> - io_ring_submit_unlock(ctx, issue_flags);
> - }
> done:
> if (ret < 0)
> req_set_fail(req);
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 7aa1e4c9f64a3..f37cd883d1625 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -353,25 +353,14 @@ int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> int ddir)
> {
> - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> - struct io_ring_ctx *ctx = req->ctx;
> - struct io_rsrc_node *node;
> - struct io_async_rw *io;
> int ret;
>
> ret = io_prep_rw(req, sqe, ddir, false);
> if (unlikely(ret))
> return ret;
>
> - node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
> - if (!node)
> - return -EFAULT;
> - io_req_assign_buf_node(req, node);
> -
> - io = req->async_data;
> - ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
> - iov_iter_save_state(&io->iter, &io->iter_state);
> - return ret;
> + req->flags |= REQ_F_FIXED_BUFFER;
> + return 0;
> }
>
> int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> @@ -954,10 +943,36 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
> return ret;
> }
>
> +static int io_import_fixed_buffer(struct io_kiocb *req, int ddir)
> +{
> + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> + struct io_async_rw *io;
> + int ret;
> +
> + if (!(req->flags & REQ_F_FIXED_BUFFER))
> + return 0;
> +
> + io = req->async_data;
> + if (io->bytes_done)
> + return 0;
> +
> + ret = io_import_fixed(ddir, &io->iter, req->buf_node->buf, rw->addr,
> + rw->len);
> + if (ret)
> + return ret;
> +
> + iov_iter_save_state(&io->iter, &io->iter_state);
> + return 0;
> +}
> +
> int io_read(struct io_kiocb *req, unsigned int issue_flags)
> {
> int ret;
>
> + ret = io_import_fixed_buffer(req, READ);
> + if (unlikely(ret))
> + return ret;
> +
> ret = __io_read(req, issue_flags);
> if (ret >= 0)
> return kiocb_done(req, ret, issue_flags);
> @@ -1062,6 +1077,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
> ssize_t ret, ret2;
> loff_t *ppos;
>
> + ret = io_import_fixed_buffer(req, WRITE);
> + if (unlikely(ret))
> + return ret;
> +
> ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
> if (unlikely(ret))
> return ret;
> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> index 1f6a82128b475..70210b4e0b0f6 100644
> --- a/io_uring/uring_cmd.c
> +++ b/io_uring/uring_cmd.c
> @@ -202,19 +202,8 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> return -EINVAL;
>
> if (ioucmd->flags & IORING_URING_CMD_FIXED) {
> - struct io_ring_ctx *ctx = req->ctx;
> - struct io_rsrc_node *node;
> - u16 index = READ_ONCE(sqe->buf_index);
> -
> - node = io_rsrc_node_lookup(&ctx->buf_table, index);
> - if (unlikely(!node))
> - return -EFAULT;
> - /*
> - * Pi node upfront, prior to io_uring_cmd_import_fixed()
> - * being called. This prevents destruction of the mapped buffer
> - * we'll need at actual import time.
> - */
> - io_req_assign_buf_node(req, node);
> + req->buf_index = READ_ONCE(sqe->buf_index);
> + req->flags |= REQ_F_FIXED_BUFFER;
> }
> ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
>
> @@ -272,7 +261,6 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
> struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
> struct io_rsrc_node *node = req->buf_node;
>
> - /* Must have had rsrc_node assigned at prep time */
> if (node)
> return io_import_fixed(rw, iter, node->buf, ubuf, len);
>
> --
> 2.43.5
>
>
* [PATCHv3 2/5] io_uring: add support for kernel registered bvecs
2025-02-14 15:43 [PATCHv3 0/5] ublk zero-copy support Keith Busch
2025-02-14 15:43 ` [PATCHv3 1/5] io_uring: move fixed buffer import to issue path Keith Busch
@ 2025-02-14 15:43 ` Keith Busch
2025-02-14 20:38 ` Caleb Sander Mateos
2025-02-14 15:43 ` [PATCHv3 3/5] ublk: zc register/unregister bvec Keith Busch
` (2 subsequent siblings)
4 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2025-02-14 15:43 UTC (permalink / raw)
To: ming.lei, asml.silence, axboe, linux-block, io-uring; +Cc: bernd, Keith Busch
From: Keith Busch <[email protected]>
Provide an interface for the kernel to leverage the existing
pre-registered buffers that io_uring provides. User space can reference
these later to achieve zero-copy IO.
User space must register a sparse fixed buffer table with io_uring in
order for the kernel to make use of it. Kernel users of this interface
need to register a callback to know when the last reference is released.
io_uring uses the existence of this callback to differentiate user vs
kernel registered buffers.
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/io_uring.h | 1 +
include/linux/io_uring_types.h | 6 ++
io_uring/rsrc.c | 112 ++++++++++++++++++++++++++++++---
io_uring/rsrc.h | 2 +
4 files changed, 113 insertions(+), 8 deletions(-)
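For reference, a minimal sketch of how a kernel user is expected to call
the new interface (the release callback and driver helpers here are
placeholders; ublk wires this up for real in the next patch):

	static void my_buf_release(void *priv)
	{
		struct request *rq = priv;

		/* last io_uring reference dropped, safe to put the request */
		my_driver_put_request(rq);
	}

	/* from a driver's uring_cmd handler: publish rq's bvecs at @index
	 * in the submitting ring's fixed buffer table */
	ret = io_buffer_register_bvec(ctx, rq, my_buf_release, index,
				      issue_flags);
	if (ret)
		return ret;

	/* ... later, once userspace is done with the buffer ... */
	io_buffer_unregister_bvec(ctx, index, issue_flags);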
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 85fe4e6b275c7..b5637a2aae340 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/xarray.h>
#include <uapi/linux/io_uring.h>
+#include <linux/blk-mq.h>
#if defined(CONFIG_IO_URING)
void __io_uring_cancel(bool cancel_all);
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d5bf336882aa8..b9feba4df60c9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -696,4 +696,10 @@ static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
return ctx->flags & IORING_SETUP_CQE32;
}
+int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
+ void (*release)(void *), unsigned int index,
+ unsigned int issue_flags);
+void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int tag,
+ unsigned int issue_flags);
+
#endif
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index af39b69eb4fde..0e323ca1e8e5c 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -103,19 +103,23 @@ static int io_buffer_validate(struct iovec *iov)
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
- unsigned int i;
+ struct io_mapped_ubuf *imu = node->buf;
- if (node->buf) {
- struct io_mapped_ubuf *imu = node->buf;
+ if (!refcount_dec_and_test(&imu->refs))
+ return;
+
+ if (imu->release) {
+ imu->release(imu->priv);
+ } else {
+ unsigned int i;
- if (!refcount_dec_and_test(&imu->refs))
- return;
for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
- kvfree(imu);
}
+
+ kvfree(imu);
}
struct io_rsrc_node *io_rsrc_node_alloc(int type)
@@ -764,6 +768,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
imu->len = iov->iov_len;
imu->nr_bvecs = nr_pages;
imu->folio_shift = PAGE_SHIFT;
+ imu->release = NULL;
+ imu->priv = NULL;
if (coalesced)
imu->folio_shift = data.folio_shift;
refcount_set(&imu->refs, 1);
@@ -860,6 +866,89 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
+int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
+ void (*release)(void *), unsigned int index,
+ unsigned int issue_flags)
+{
+ struct io_rsrc_data *data = &ctx->buf_table;
+ struct req_iterator rq_iter;
+ struct io_mapped_ubuf *imu;
+ struct io_rsrc_node *node;
+ int ret = 0, i = 0;
+ struct bio_vec bv;
+ u16 nr_bvecs;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ if (index >= data->nr) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ node = data->nodes[index];
+ if (node) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+ if (!node) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ nr_bvecs = blk_rq_nr_phys_segments(rq);
+ imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+ if (!imu) {
+ kfree(node);
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ imu->ubuf = 0;
+ imu->len = blk_rq_bytes(rq);
+ imu->acct_pages = 0;
+ imu->nr_bvecs = nr_bvecs;
+ refcount_set(&imu->refs, 1);
+ imu->release = release;
+ imu->priv = rq;
+
+ rq_for_each_bvec(bv, rq, rq_iter)
+ bvec_set_page(&imu->bvec[i++], bv.bv_page, bv.bv_len,
+ bv.bv_offset);
+
+ node->buf = imu;
+ data->nodes[index] = node;
+unlock:
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
+
+void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index,
+ unsigned int issue_flags)
+{
+ struct io_rsrc_data *data = &ctx->buf_table;
+ struct io_rsrc_node *node;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ if (!data->nr)
+ goto unlock;
+ if (index >= data->nr)
+ goto unlock;
+
+ node = data->nodes[index];
+ if (!node || !node->buf)
+ goto unlock;
+ if (!node->buf->release)
+ goto unlock;
+ io_reset_rsrc_node(ctx, data, index);
+unlock:
+ io_ring_submit_unlock(ctx, issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
+
int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
u64 buf_addr, size_t len)
@@ -886,8 +975,8 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
/*
* Don't use iov_iter_advance() here, as it's really slow for
* using the latter parts of a big fixed buffer - it iterates
- * over each segment manually. We can cheat a bit here, because
- * we know that:
+ * over each segment manually. We can cheat a bit here for user
+ * registered nodes, because we know that:
*
* 1) it's a BVEC iter, we set it up
* 2) all bvecs are the same in size, except potentially the
@@ -901,8 +990,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
*/
const struct bio_vec *bvec = imu->bvec;
+ /*
+ * Kernel buffer bvecs, on the other hand, don't necessarily
+ * have the size property of user registered ones, so we have
+ * to use the slow iter advance.
+ */
if (offset < bvec->bv_len) {
iter->iov_offset = offset;
+ } else if (imu->release) {
+ iov_iter_advance(iter, offset);
} else {
unsigned long seg_skip;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 190f7ee45de93..2e8d1862caefc 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -33,6 +33,8 @@ struct io_mapped_ubuf {
unsigned int folio_shift;
refcount_t refs;
unsigned long acct_pages;
+ void (*release)(void *);
+ void *priv;
struct bio_vec bvec[] __counted_by(nr_bvecs);
};
--
2.43.5
* Re: [PATCHv3 2/5] io_uring: add support for kernel registered bvecs
2025-02-14 15:43 ` [PATCHv3 2/5] io_uring: add support for kernel registered bvecs Keith Busch
@ 2025-02-14 20:38 ` Caleb Sander Mateos
2025-02-18 19:59 ` Keith Busch
0 siblings, 1 reply; 17+ messages in thread
From: Caleb Sander Mateos @ 2025-02-14 20:38 UTC (permalink / raw)
To: Keith Busch
Cc: ming.lei, asml.silence, axboe, linux-block, io-uring, bernd,
Keith Busch
On Fri, Feb 14, 2025 at 7:45 AM Keith Busch <[email protected]> wrote:
>
> From: Keith Busch <[email protected]>
>
> Provide an interface for the kernel to leverage the existing
> pre-registered buffers that io_uring provides. User space can reference
> these later to achieve zero-copy IO.
>
> User space must register a sparse fixed buffer table with io_uring in
> order for the kernel to make use of it. Kernel users of this interface
> need to register a callback to know when the last reference is released.
> io_uring uses the existence of this callback to differentiate user vs
> kernel registered buffers.
>
> Signed-off-by: Keith Busch <[email protected]>
> ---
> include/linux/io_uring.h | 1 +
> include/linux/io_uring_types.h | 6 ++
> io_uring/rsrc.c | 112 ++++++++++++++++++++++++++++++---
> io_uring/rsrc.h | 2 +
> 4 files changed, 113 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index 85fe4e6b275c7..b5637a2aae340 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -5,6 +5,7 @@
> #include <linux/sched.h>
> #include <linux/xarray.h>
> #include <uapi/linux/io_uring.h>
> +#include <linux/blk-mq.h>
>
> #if defined(CONFIG_IO_URING)
> void __io_uring_cancel(bool cancel_all);
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index d5bf336882aa8..b9feba4df60c9 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -696,4 +696,10 @@ static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
> return ctx->flags & IORING_SETUP_CQE32;
> }
>
> +int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
> + void (*release)(void *), unsigned int index,
> + unsigned int issue_flags);
> +void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int tag,
> + unsigned int issue_flags);
Change "tag" to "index" to match the definition?
> +
> #endif
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index af39b69eb4fde..0e323ca1e8e5c 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -103,19 +103,23 @@ static int io_buffer_validate(struct iovec *iov)
>
> static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> {
> - unsigned int i;
> + struct io_mapped_ubuf *imu = node->buf;
>
> - if (node->buf) {
> - struct io_mapped_ubuf *imu = node->buf;
> + if (!refcount_dec_and_test(&imu->refs))
> + return;
> +
> + if (imu->release) {
> + imu->release(imu->priv);
> + } else {
> + unsigned int i;
>
> - if (!refcount_dec_and_test(&imu->refs))
> - return;
> for (i = 0; i < imu->nr_bvecs; i++)
> unpin_user_page(imu->bvec[i].bv_page);
> if (imu->acct_pages)
> io_unaccount_mem(ctx, imu->acct_pages);
> - kvfree(imu);
> }
> +
> + kvfree(imu);
> }
>
> struct io_rsrc_node *io_rsrc_node_alloc(int type)
> @@ -764,6 +768,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> imu->len = iov->iov_len;
> imu->nr_bvecs = nr_pages;
> imu->folio_shift = PAGE_SHIFT;
> + imu->release = NULL;
> + imu->priv = NULL;
> if (coalesced)
> imu->folio_shift = data.folio_shift;
> refcount_set(&imu->refs, 1);
> @@ -860,6 +866,89 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> return ret;
> }
>
> +int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
> + void (*release)(void *), unsigned int index,
> + unsigned int issue_flags)
> +{
> + struct io_rsrc_data *data = &ctx->buf_table;
> + struct req_iterator rq_iter;
> + struct io_mapped_ubuf *imu;
> + struct io_rsrc_node *node;
> + int ret = 0, i = 0;
Use an unsigned type for i so it doesn't need to be sign-extended when
used as an array index?
> + struct bio_vec bv;
> + u16 nr_bvecs;
> +
> + io_ring_submit_lock(ctx, issue_flags);
> +
> + if (index >= data->nr) {
> + ret = -EINVAL;
> + goto unlock;
> + }
> +
> + node = data->nodes[index];
> + if (node) {
I think Pavel already suggested using array_index_nospec() since this
index is under userspace control.
Also a nit, but I don't see the need to store data->nodes[index] in an
intermediate variable.
> + ret = -EBUSY;
> + goto unlock;
> + }
> +
> + node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> + if (!node) {
> + ret = -ENOMEM;
> + goto unlock;
> + }
> +
> + nr_bvecs = blk_rq_nr_phys_segments(rq);
Is this guaranteed to match the number of bvecs in the request?
Wouldn't the number of physical segments depend on how the block
device splits the bvecs? lo_rw_aio() uses rq_for_each_bvec() to count
the number of bvecs, for example.
> + imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> + if (!imu) {
> + kfree(node);
> + ret = -ENOMEM;
> + goto unlock;
> + }
> +
> + imu->ubuf = 0;
> + imu->len = blk_rq_bytes(rq);
> + imu->acct_pages = 0;
> + imu->nr_bvecs = nr_bvecs;
> + refcount_set(&imu->refs, 1);
> + imu->release = release;
> + imu->priv = rq;
Consider initializing imu->folio_shift? I don't think it's used for
kbufs, but neither is acct_pages. One more store to the same cache
line shouldn't be expensive.
> +
> + rq_for_each_bvec(bv, rq, rq_iter)
> + bvec_set_page(&imu->bvec[i++], bv.bv_page, bv.bv_len,
> + bv.bv_offset);
Just imu->bvec[i++] = bv; ?
> +
> + node->buf = imu;
> + data->nodes[index] = node;
> +unlock:
> + io_ring_submit_unlock(ctx, issue_flags);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
> +
> +void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index,
> + unsigned int issue_flags)
> +{
> + struct io_rsrc_data *data = &ctx->buf_table;
> + struct io_rsrc_node *node;
> +
> + io_ring_submit_lock(ctx, issue_flags);
> +
> + if (!data->nr)
> + goto unlock;
> + if (index >= data->nr)
> + goto unlock;
If data->nr is 0, index >= data->nr will always be true. So I think
you can get rid of the first if statement.
> +
> + node = data->nodes[index];
I think Pavel suggested using array_index_nospec() here too.
> + if (!node || !node->buf)
> + goto unlock;
Pavel asked how node->buf could be NULL if node is not. I agree it
doesn't seem possible based on how io_buffer_register_bvec()
initializes the nodes.
> + if (!node->buf->release)
> + goto unlock;
Probably could combine this with the if (!node || !node->buf) above.
> + io_reset_rsrc_node(ctx, data, index);
io_reset_rsrc_node() reads data->nodes[index] again. How about just
open-coding the call to io_put_rsrc_node(ctx, node); and setting
data->nodes[index] = NULL; here?
Best,
Caleb
> +unlock:
> + io_ring_submit_unlock(ctx, issue_flags);
> +}
> +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
> +
> int io_import_fixed(int ddir, struct iov_iter *iter,
> struct io_mapped_ubuf *imu,
> u64 buf_addr, size_t len)
> @@ -886,8 +975,8 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
> /*
> * Don't use iov_iter_advance() here, as it's really slow for
> * using the latter parts of a big fixed buffer - it iterates
> - * over each segment manually. We can cheat a bit here, because
> - * we know that:
> + * over each segment manually. We can cheat a bit here for user
> + * registered nodes, because we know that:
> *
> * 1) it's a BVEC iter, we set it up
> * 2) all bvecs are the same in size, except potentially the
> @@ -901,8 +990,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
> */
> const struct bio_vec *bvec = imu->bvec;
>
> + /*
> + * Kernel buffer bvecs, on the other hand, don't necessarily
> + * have the size property of user registered ones, so we have
> + * to use the slow iter advance.
> + */
> if (offset < bvec->bv_len) {
> iter->iov_offset = offset;
> + } else if (imu->release) {
> + iov_iter_advance(iter, offset);
> } else {
> unsigned long seg_skip;
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 190f7ee45de93..2e8d1862caefc 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -33,6 +33,8 @@ struct io_mapped_ubuf {
> unsigned int folio_shift;
> refcount_t refs;
> unsigned long acct_pages;
> + void (*release)(void *);
> + void *priv;
> struct bio_vec bvec[] __counted_by(nr_bvecs);
> };
>
> --
> 2.43.5
>
>
* Re: [PATCHv3 2/5] io_uring: add support for kernel registered bvecs
2025-02-14 20:38 ` Caleb Sander Mateos
@ 2025-02-18 19:59 ` Keith Busch
2025-02-18 20:20 ` Caleb Sander Mateos
0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2025-02-18 19:59 UTC (permalink / raw)
To: Caleb Sander Mateos
Cc: Keith Busch, ming.lei, asml.silence, axboe, linux-block, io-uring,
bernd
On Fri, Feb 14, 2025 at 12:38:54PM -0800, Caleb Sander Mateos wrote:
> On Fri, Feb 14, 2025 at 7:45 AM Keith Busch <[email protected]> wrote:
> > +
> > + nr_bvecs = blk_rq_nr_phys_segments(rq);
>
> Is this guaranteed to match the number of bvecs in the request?
Yes.
> Wouldn't the number of physical segments depend on how the block
> device splits the bvecs?
Also yes.
>lo_rw_aio() uses rq_for_each_bvec() to count
> the number of bvecs, for example.
Hm, that seems unnecessary. The request's nr_phys_segments is
initialized to the number of bvecs rather than page segments, so it can
be used instead of recounting them from a given struct request.
The initial number of physical segments for a request is set in
bio_split_rw_at(), which uses bio_for_each_bvec(). That's what
rq_for_each_bvec would use, too. The same is used for any bios that get
merged into the request.
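For reference, the counting loop in lo_rw_aio() looks roughly like:

	struct req_iterator rq_iter;
	struct bio_vec bv;
	unsigned int nr_bvecs = 0;

	rq_for_each_bvec(bv, rq, rq_iter)
		nr_bvecs++;

and per the above it should always end up equal to
blk_rq_nr_phys_segments(rq), since both are derived from the same
bio_for_each_bvec() segmentation.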
* Re: [PATCHv3 2/5] io_uring: add support for kernel registered bvecs
2025-02-18 19:59 ` Keith Busch
@ 2025-02-18 20:20 ` Caleb Sander Mateos
0 siblings, 0 replies; 17+ messages in thread
From: Caleb Sander Mateos @ 2025-02-18 20:20 UTC (permalink / raw)
To: Keith Busch
Cc: Keith Busch, ming.lei, asml.silence, axboe, linux-block, io-uring,
bernd
On Tue, Feb 18, 2025 at 11:59 AM Keith Busch <[email protected]> wrote:
>
> On Fri, Feb 14, 2025 at 12:38:54PM -0800, Caleb Sander Mateos wrote:
> > On Fri, Feb 14, 2025 at 7:45 AM Keith Busch <[email protected]> wrote:
> > > +
> > > + nr_bvecs = blk_rq_nr_phys_segments(rq);
> >
> > Is this guaranteed to match the number of bvecs in the request?
>
> Yes.
>
> > Wouldn't the number of physical segments depend on how the block
> > device splits the bvecs?
>
> Also yes.
>
> >lo_rw_aio() uses rq_for_each_bvec() to count
> > the number of bvecs, for example.
>
> Hm, that seems unnecessary. The request's nr_phys_segments is
> initialized to the number of bvecs rather than page segments, so it can
> be used instead of recounting them from a given struct request.
>
> The initial number of physical segments for a request is set in
> bio_split_rw_at(), which uses bio_for_each_bvec(). That's what
> rq_for_each_bvec would use, too. The same is used for any bios that get
> merged into the request.
Okay, thanks for verifying!
Best,
Caleb
* [PATCHv3 3/5] ublk: zc register/unregister bvec
2025-02-14 15:43 [PATCHv3 0/5] ublk zero-copy support Keith Busch
2025-02-14 15:43 ` [PATCHv3 1/5] io_uring: move fixed buffer import to issue path Keith Busch
2025-02-14 15:43 ` [PATCHv3 2/5] io_uring: add support for kernel registered bvecs Keith Busch
@ 2025-02-14 15:43 ` Keith Busch
2025-02-14 15:43 ` [PATCHv3 4/5] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-14 15:43 ` [PATCHv3 5/5] io_uring: cache nodes and mapped buffers Keith Busch
4 siblings, 0 replies; 17+ messages in thread
From: Keith Busch @ 2025-02-14 15:43 UTC (permalink / raw)
To: ming.lei, asml.silence, axboe, linux-block, io-uring; +Cc: bernd, Keith Busch
From: Keith Busch <[email protected]>
Provide new operations for the user to request mapping an active request
to an io_uring instance's buf_table. The user has to provide the index
at which it wants to install the buffer.
A reference count is taken on the request to ensure it can't be
completed while it is active in a ring's buf_table.
Signed-off-by: Keith Busch <[email protected]>
---
drivers/block/ublk_drv.c | 137 +++++++++++++++++++++++++---------
include/uapi/linux/ublk_cmd.h | 4 +
2 files changed, 105 insertions(+), 36 deletions(-)
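Not part of the diff, but a rough sketch of the server-side usage
(untested; error handling omitted, and ring/q_id/tag/index are assumed
from the surrounding ublk server code):

	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct ublksrv_io_cmd *cmd;

	/* register the pages of the request identified by (q_id, tag)
	 * at slot 'index' of this ring's fixed buffer table */
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = ublk_char_fd;			/* /dev/ublkcN */
	sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF;
	cmd = (struct ublksrv_io_cmd *)sqe->cmd;
	cmd->q_id = q_id;
	cmd->tag = tag;
	cmd->addr = index;

	/* ... issue zero-copy I/O against buf_table[index], e.g. a
	 * linked IORING_OP_READ_FIXED/WRITE_FIXED, then mirror the
	 * above with UBLK_U_IO_UNREGISTER_IO_BUF to drop the ring's
	 * reference on the request ... */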
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 529085181f355..0c753176b14e9 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -51,6 +51,9 @@
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
+#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
+
/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
| UBLK_F_URING_CMD_COMP_IN_TASK \
@@ -201,7 +204,7 @@ static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
- return ub->dev_info.flags & UBLK_F_USER_COPY;
+ return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
}
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
@@ -581,7 +584,7 @@ static void ublk_apply_params(struct ublk_device *ub)
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
- return ubq->flags & UBLK_F_USER_COPY;
+ return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
}
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@@ -1747,6 +1750,96 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
+static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
+ struct ublk_queue *ubq, int tag, size_t offset)
+{
+ struct request *req;
+
+ if (!ublk_need_req_ref(ubq))
+ return NULL;
+
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (!req)
+ return NULL;
+
+ if (!ublk_get_req_ref(ubq, req))
+ return NULL;
+
+ if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
+ goto fail_put;
+
+ if (!ublk_rq_has_data(req))
+ goto fail_put;
+
+ if (offset > blk_rq_bytes(req))
+ goto fail_put;
+
+ return req;
+fail_put:
+ ublk_put_req_ref(ubq, req);
+ return NULL;
+}
+
+static void ublk_io_release(void *priv)
+{
+ struct request *rq = priv;
+ struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+
+ ublk_put_req_ref(ubq, rq);
+}
+
+static int ublk_register_io_buf(struct io_uring_cmd *cmd,
+ struct ublk_queue *ubq, int tag,
+ const struct ublksrv_io_cmd *ub_cmd,
+ unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct ublk_device *ub = cmd->file->private_data;
+ int index = (int)ub_cmd->addr, ret;
+ struct ublk_rq_data *data;
+ struct request *req;
+
+ if (!ub)
+ return -EPERM;
+
+ req = __ublk_check_and_get_req(ub, ubq, tag, 0);
+ if (!req)
+ return -EINVAL;
+
+ data = blk_mq_rq_to_pdu(req);
+ ret = io_buffer_register_bvec(ctx, req, ublk_io_release, index,
+ issue_flags);
+ if (ret) {
+ ublk_put_req_ref(ubq, req);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
+ struct ublk_queue *ubq, int tag,
+ const struct ublksrv_io_cmd *ub_cmd,
+ unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct ublk_device *ub = cmd->file->private_data;
+ int index = (int)ub_cmd->addr;
+ struct ublk_rq_data *data;
+ struct request *req;
+
+ if (!ub)
+ return -EPERM;
+
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (!req)
+ return -EINVAL;
+
+ data = blk_mq_rq_to_pdu(req);
+ io_buffer_unregister_bvec(ctx, index, issue_flags);
+ return 0;
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1798,6 +1891,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
ret = -EINVAL;
switch (_IOC_NR(cmd_op)) {
+ case UBLK_IO_REGISTER_IO_BUF:
+ return ublk_register_io_buf(cmd, ubq, tag, ub_cmd, issue_flags);
+ case UBLK_IO_UNREGISTER_IO_BUF:
+ return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd,
+ issue_flags);
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
if (ublk_queue_ready(ubq)) {
@@ -1872,36 +1970,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
return -EIOCBQUEUED;
}
-static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- struct ublk_queue *ubq, int tag, size_t offset)
-{
- struct request *req;
-
- if (!ublk_need_req_ref(ubq))
- return NULL;
-
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
- if (!req)
- return NULL;
-
- if (!ublk_get_req_ref(ubq, req))
- return NULL;
-
- if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
- goto fail_put;
-
- if (!ublk_rq_has_data(req))
- goto fail_put;
-
- if (offset > blk_rq_bytes(req))
- goto fail_put;
-
- return req;
-fail_put:
- ublk_put_req_ref(ubq, req);
- return NULL;
-}
-
static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
@@ -2527,9 +2595,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
goto out_free_dev_number;
}
- /* We are not ready to support zero copy */
- ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
-
ub->dev_info.nr_hw_queues = min_t(unsigned int,
ub->dev_info.nr_hw_queues, nr_cpu_ids);
ublk_align_max_io_size(ub);
@@ -2860,7 +2925,7 @@ static int ublk_ctrl_get_features(struct io_uring_cmd *cmd)
{
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
void __user *argp = (void __user *)(unsigned long)header->addr;
- u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY;
+ u64 features = UBLK_F_ALL;
if (header->len != UBLK_FEATURES_LEN || !header->addr)
return -EINVAL;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index a8bc98bb69fce..74246c926b55f 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -94,6 +94,10 @@
_IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd)
#define UBLK_U_IO_NEED_GET_DATA \
_IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd)
+#define UBLK_U_IO_REGISTER_IO_BUF \
+ _IOWR('u', 0x23, struct ublksrv_io_cmd)
+#define UBLK_U_IO_UNREGISTER_IO_BUF \
+ _IOWR('u', 0x24, struct ublksrv_io_cmd)
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
--
2.43.5
* [PATCHv3 4/5] io_uring: add abstraction for buf_table rsrc data
2025-02-14 15:43 [PATCHv3 0/5] ublk zero-copy support Keith Busch
` (2 preceding siblings ...)
2025-02-14 15:43 ` [PATCHv3 3/5] ublk: zc register/unregister bvec Keith Busch
@ 2025-02-14 15:43 ` Keith Busch
2025-02-14 15:43 ` [PATCHv3 5/5] io_uring: cache nodes and mapped buffers Keith Busch
4 siblings, 0 replies; 17+ messages in thread
From: Keith Busch @ 2025-02-14 15:43 UTC (permalink / raw)
To: ming.lei, asml.silence, axboe, linux-block, io-uring; +Cc: bernd, Keith Busch
From: Keith Busch <[email protected]>
We'll need to add more fields specific to the registered buffers, so
make a layer for it now. No functional change in this patch.
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/io_uring_types.h | 6 ++++-
io_uring/fdinfo.c | 8 +++---
io_uring/register.c | 2 +-
io_uring/rsrc.c | 47 +++++++++++++++++-----------------
4 files changed, 33 insertions(+), 30 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index b9feba4df60c9..d8d717cce427f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,6 +67,10 @@ struct io_file_table {
unsigned int alloc_hint;
};
+struct io_buf_table {
+ struct io_rsrc_data data;
+};
+
struct io_hash_bucket {
struct hlist_head list;
} ____cacheline_aligned_in_smp;
@@ -291,7 +295,7 @@ struct io_ring_ctx {
struct io_wq_work_list iopoll_list;
struct io_file_table file_table;
- struct io_rsrc_data buf_table;
+ struct io_buf_table buf_table;
struct io_submit_state submit_state;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index f60d0a9d505e2..d389c06cbce10 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -217,12 +217,12 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
seq_puts(m, "\n");
}
}
- seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
- for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+ seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.data.nr);
+ for (i = 0; has_lock && i < ctx->buf_table.data.nr; i++) {
struct io_mapped_ubuf *buf = NULL;
- if (ctx->buf_table.nodes[i])
- buf = ctx->buf_table.nodes[i]->buf;
+ if (ctx->buf_table.data.nodes[i])
+ buf = ctx->buf_table.data.nodes[i]->buf;
if (buf)
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
else
diff --git a/io_uring/register.c b/io_uring/register.c
index 9a4d2fbce4aec..fa922b1b26583 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -919,7 +919,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
ret = __io_uring_register(ctx, opcode, arg, nr_args);
trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
- ctx->buf_table.nr, ret);
+ ctx->buf_table.data.nr, ret);
mutex_unlock(&ctx->uring_lock);
fput(file);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 0e323ca1e8e5c..fd7a1b04db8b7 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -235,9 +235,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
__u32 done;
int i, err;
- if (!ctx->buf_table.nr)
+ if (!ctx->buf_table.data.nr)
return -ENXIO;
- if (up->offset + nr_args > ctx->buf_table.nr)
+ if (up->offset + nr_args > ctx->buf_table.data.nr)
return -EINVAL;
for (done = 0; done < nr_args; done++) {
@@ -270,8 +270,8 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
node->tag = tag;
}
i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
- io_reset_rsrc_node(ctx, &ctx->buf_table, i);
- ctx->buf_table.nodes[i] = node;
+ io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
+ ctx->buf_table.data.nodes[i] = node;
if (ctx->compat)
user_data += sizeof(struct compat_iovec);
else
@@ -549,9 +549,9 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
- if (!ctx->buf_table.nr)
+ if (!ctx->buf_table.data.nr)
return -ENXIO;
- io_rsrc_data_free(ctx, &ctx->buf_table);
+ io_rsrc_data_free(ctx, &ctx->buf_table.data);
return 0;
}
@@ -578,8 +578,8 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
}
/* check previously registered pages */
- for (i = 0; i < ctx->buf_table.nr; i++) {
- struct io_rsrc_node *node = ctx->buf_table.nodes[i];
+ for (i = 0; i < ctx->buf_table.data.nr; i++) {
+ struct io_rsrc_node *node = ctx->buf_table.data.nodes[i];
struct io_mapped_ubuf *imu;
if (!node)
@@ -807,7 +807,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
- if (ctx->buf_table.nr)
+ if (ctx->buf_table.data.nr)
return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL;
@@ -860,7 +860,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
data.nodes[i] = node;
}
- ctx->buf_table = data;
+ ctx->buf_table.data = data;
if (ret)
io_sqe_buffers_unregister(ctx);
return ret;
@@ -870,7 +870,7 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags)
{
- struct io_rsrc_data *data = &ctx->buf_table;
+ struct io_rsrc_data *data = &ctx->buf_table.data;
struct req_iterator rq_iter;
struct io_mapped_ubuf *imu;
struct io_rsrc_node *node;
@@ -928,7 +928,7 @@ EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index,
unsigned int issue_flags)
{
- struct io_rsrc_data *data = &ctx->buf_table;
+ struct io_rsrc_data *data = &ctx->buf_table.data;
struct io_rsrc_node *node;
io_ring_submit_lock(ctx, issue_flags);
@@ -1046,10 +1046,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (!arg->nr && (arg->dst_off || arg->src_off))
return -EINVAL;
/* not allowed unless REPLACE is set */
- if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
+ if (ctx->buf_table.data.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
return -EBUSY;
- nbufs = src_ctx->buf_table.nr;
+ nbufs = src_ctx->buf_table.data.nr;
if (!arg->nr)
arg->nr = nbufs;
else if (arg->nr > nbufs)
@@ -1059,13 +1059,13 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
- ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
+ ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
if (ret)
return ret;
/* Fill entries in data from dst that won't overlap with src */
- for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
- struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+ for (i = 0; i < min(arg->dst_off, ctx->buf_table.data.nr); i++) {
+ struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
if (src_node) {
data.nodes[i] = src_node;
@@ -1074,7 +1074,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
}
ret = -ENXIO;
- nbufs = src_ctx->buf_table.nr;
+ nbufs = src_ctx->buf_table.data.nr;
if (!nbufs)
goto out_free;
ret = -EINVAL;
@@ -1094,7 +1094,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
while (nr--) {
struct io_rsrc_node *dst_node, *src_node;
- src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
+ src_node = io_rsrc_node_lookup(&src_ctx->buf_table.data, i);
if (!src_node) {
dst_node = NULL;
} else {
@@ -1116,7 +1116,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
* old and new nodes at this point.
*/
if (arg->flags & IORING_REGISTER_DST_REPLACE)
- io_rsrc_data_free(ctx, &ctx->buf_table);
+ io_sqe_buffers_unregister(ctx);
/*
* ctx->buf_table must be empty now - either the contents are being
@@ -1124,10 +1124,9 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
* copied to a ring that does not have buffers yet (checked at function
* entry).
*/
- WARN_ON_ONCE(ctx->buf_table.nr);
- ctx->buf_table = data;
+ WARN_ON_ONCE(ctx->buf_table.data.nr);
+ ctx->buf_table.data = data;
return 0;
-
out_free:
io_rsrc_data_free(ctx, &data);
return ret;
@@ -1152,7 +1151,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
return -EFAULT;
if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
return -EINVAL;
- if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
+ if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.data.nr)
return -EBUSY;
if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
return -EINVAL;
--
2.43.5
* [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
2025-02-14 15:43 [PATCHv3 0/5] ublk zero-copy support Keith Busch
` (3 preceding siblings ...)
2025-02-14 15:43 ` [PATCHv3 4/5] io_uring: add abstraction for buf_table rsrc data Keith Busch
@ 2025-02-14 15:43 ` Keith Busch
2025-02-15 2:22 ` Caleb Sander Mateos
4 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2025-02-14 15:43 UTC (permalink / raw)
To: ming.lei, asml.silence, axboe, linux-block, io-uring; +Cc: bernd, Keith Busch
From: Keith Busch <[email protected]>
Frequent alloc/free cycles on these are pretty costly. Use an io cache to
more efficiently reuse these buffers.
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/io_uring_types.h | 18 +++---
io_uring/filetable.c | 2 +-
io_uring/rsrc.c | 114 +++++++++++++++++++++++++--------
io_uring/rsrc.h | 2 +-
4 files changed, 99 insertions(+), 37 deletions(-)
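For reference, the alloc-cache pattern the buffer table switches to, a
condensed sketch of the existing io_uring/alloc_cache.h helpers as used
below (not a new API):

	struct io_alloc_cache cache;

	/* one-time setup: cache up to 'nr' objects of the given size */
	if (io_alloc_cache_init(&cache, nr, sizeof(struct io_rsrc_node), 0))
		return -ENOMEM;

	/* hot path: reuse a cached object if available, else allocate */
	node = io_cache_alloc(&cache, GFP_KERNEL);

	/* on free: stash the object back in the cache when there is room */
	if (!io_alloc_cache_put(&cache, node))
		kfree(node);

	/* teardown: free everything still sitting in the cache */
	io_alloc_cache_free(&cache, kfree);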
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d8d717cce427f..ebaaa1c7e210f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,8 +67,18 @@ struct io_file_table {
unsigned int alloc_hint;
};
+struct io_alloc_cache {
+ void **entries;
+ unsigned int nr_cached;
+ unsigned int max_cached;
+ size_t elem_size;
+ unsigned int init_clear;
+};
+
struct io_buf_table {
struct io_rsrc_data data;
+ struct io_alloc_cache node_cache;
+ struct io_alloc_cache imu_cache;
};
struct io_hash_bucket {
@@ -222,14 +232,6 @@ struct io_submit_state {
struct blk_plug plug;
};
-struct io_alloc_cache {
- void **entries;
- unsigned int nr_cached;
- unsigned int max_cached;
- unsigned int elem_size;
- unsigned int init_clear;
-};
-
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index dd8eeec97acf6..a21660e3145ab 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
if (slot_index >= ctx->file_table.data.nr)
return -EINVAL;
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node)
return -ENOMEM;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index fd7a1b04db8b7..26ff9b5851d94 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)
+#define IO_CACHED_BVECS_SEGS 32
+
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
@@ -122,19 +124,33 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
kvfree(imu);
}
-struct io_rsrc_node *io_rsrc_node_alloc(int type)
+
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
struct io_rsrc_node *node;
- node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (type == IORING_RSRC_FILE)
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ else
+ node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
if (node) {
node->type = type;
node->refs = 1;
+ node->tag = 0;
+ node->file_ptr = 0;
}
return node;
}
-__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
+static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
+{
+ kvfree(data->nodes);
+ data->nodes = NULL;
+ data->nr = 0;
+}
+
+__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
+ struct io_rsrc_data *data)
{
if (!data->nr)
return;
@@ -142,9 +158,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
if (data->nodes[data->nr])
io_put_rsrc_node(ctx, data->nodes[data->nr]);
}
- kvfree(data->nodes);
- data->nodes = NULL;
- data->nr = 0;
+ __io_rsrc_data_free(data);
}
__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
@@ -158,6 +172,33 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
return -ENOMEM;
}
+static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
+{
+ const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
+ IO_CACHED_BVECS_SEGS);
+ int ret;
+
+ ret = io_rsrc_data_alloc(&table->data, nr);
+ if (ret)
+ return ret;
+
+ ret = io_alloc_cache_init(&table->node_cache, nr,
+ sizeof(struct io_rsrc_node), 0);
+ if (ret)
+ goto out_1;
+
+ ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0);
+ if (ret)
+ goto out_2;
+
+ return 0;
+out_2:
+ io_alloc_cache_free(&table->node_cache, kfree);
+out_1:
+ __io_rsrc_data_free(&table->data);
+ return ret;
+}
+
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned nr_args)
@@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node) {
err = -ENOMEM;
fput(file);
@@ -269,7 +310,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
}
node->tag = tag;
}
- i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
+ i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr);
io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
ctx->buf_table.data.nodes[i] = node;
if (ctx->compat)
@@ -459,6 +500,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
case IORING_RSRC_BUFFER:
if (node->buf)
io_buffer_unmap(ctx, node);
+ if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
+ return;
break;
default:
WARN_ON_ONCE(1);
@@ -527,7 +570,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
goto fail;
}
ret = -ENOMEM;
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node) {
fput(file);
goto fail;
@@ -547,11 +590,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
+static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
+ struct io_buf_table *table)
+{
+ io_rsrc_data_free(ctx, &table->data);
+ io_alloc_cache_free(&table->node_cache, kfree);
+ io_alloc_cache_free(&table->imu_cache, kfree);
+}
+
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
if (!ctx->buf_table.data.nr)
return -ENXIO;
- io_rsrc_data_free(ctx, &ctx->buf_table.data);
+ io_rsrc_buffer_free(ctx, &ctx->buf_table);
return 0;
}
@@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
return true;
}
+static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
+ int nr_bvecs)
+{
+ if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
+ return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
+ return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
+ GFP_KERNEL);
+}
+
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
struct iovec *iov,
struct page **last_hpage)
@@ -732,7 +792,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
if (!iov->iov_base)
return NULL;
- node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!node)
return ERR_PTR(-ENOMEM);
node->buf = NULL;
@@ -752,7 +812,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
}
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ imu = io_alloc_imu(ctx, nr_pages);
if (!imu)
goto done;
@@ -800,9 +860,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags)
{
struct page *last_hpage = NULL;
- struct io_rsrc_data data;
struct iovec fast_iov, *iov = &fast_iov;
const struct iovec __user *uvec;
+ struct io_buf_table table;
int i, ret;
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@@ -811,13 +871,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL;
- ret = io_rsrc_data_alloc(&data, nr_args);
+ ret = io_rsrc_buffer_alloc(&table, nr_args);
if (ret)
return ret;
if (!arg)
memset(iov, 0, sizeof(*iov));
+ ctx->buf_table = table;
for (i = 0; i < nr_args; i++) {
struct io_rsrc_node *node;
u64 tag = 0;
@@ -857,10 +918,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
node->tag = tag;
}
- data.nodes[i] = node;
+ table.data.nodes[i] = node;
}
-
- ctx->buf_table.data = data;
if (ret)
io_sqe_buffers_unregister(ctx);
return ret;
@@ -891,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
goto unlock;
}
- node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!node) {
ret = -ENOMEM;
goto unlock;
}
nr_bvecs = blk_rq_nr_phys_segments(rq);
- imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+
+ imu = io_alloc_imu(ctx, nr_bvecs);
if (!imu) {
kfree(node);
ret = -ENOMEM;
@@ -1028,7 +1088,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
struct io_uring_clone_buffers *arg)
{
- struct io_rsrc_data data;
+ struct io_buf_table table;
int i, ret, off, nr;
unsigned int nbufs;
@@ -1059,7 +1119,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
- ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
+ ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
if (ret)
return ret;
@@ -1068,7 +1128,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
if (src_node) {
- data.nodes[i] = src_node;
+ table.data.nodes[i] = src_node;
src_node->refs++;
}
}
@@ -1098,7 +1158,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (!src_node) {
dst_node = NULL;
} else {
- dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+ dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!dst_node) {
ret = -ENOMEM;
goto out_free;
@@ -1107,12 +1167,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
refcount_inc(&src_node->buf->refs);
dst_node->buf = src_node->buf;
}
- data.nodes[off++] = dst_node;
+ table.data.nodes[off++] = dst_node;
i++;
}
/*
- * If asked for replace, put the old table. data->nodes[] holds both
+ * If asked for replace, put the old table. table.data->nodes[] holds both
* old and new nodes at this point.
*/
if (arg->flags & IORING_REGISTER_DST_REPLACE)
@@ -1125,10 +1185,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
* entry).
*/
WARN_ON_ONCE(ctx->buf_table.data.nr);
- ctx->buf_table.data = data;
+ ctx->buf_table = table;
return 0;
out_free:
- io_rsrc_data_free(ctx, &data);
+ io_rsrc_buffer_free(ctx, &table);
return ret;
}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 2e8d1862caefc..c5bdac558a2b4 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -47,7 +47,7 @@ struct io_imu_folio_data {
unsigned int nr_folios;
};
-struct io_rsrc_node *io_rsrc_node_alloc(int type);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
--
2.43.5
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
2025-02-14 15:43 ` [PATCHv3 5/5] io_uring: cache nodes and mapped buffers Keith Busch
@ 2025-02-15 2:22 ` Caleb Sander Mateos
2025-02-16 22:43 ` Caleb Sander Mateos
2025-02-18 20:09 ` Keith Busch
0 siblings, 2 replies; 17+ messages in thread
From: Caleb Sander Mateos @ 2025-02-15 2:22 UTC (permalink / raw)
To: Keith Busch
Cc: ming.lei, asml.silence, axboe, linux-block, io-uring, bernd,
Keith Busch
On Fri, Feb 14, 2025 at 7:46 AM Keith Busch <[email protected]> wrote:
>
> From: Keith Busch <[email protected]>
>
> Frequent alloc/free cycles on these are pretty costly. Use an io cache to
> more efficiently reuse these buffers.
>
> Signed-off-by: Keith Busch <[email protected]>
> ---
> include/linux/io_uring_types.h | 18 +++---
> io_uring/filetable.c | 2 +-
> io_uring/rsrc.c | 114 +++++++++++++++++++++++++--------
> io_uring/rsrc.h | 2 +-
> 4 files changed, 99 insertions(+), 37 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index d8d717cce427f..ebaaa1c7e210f 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -67,8 +67,18 @@ struct io_file_table {
> unsigned int alloc_hint;
> };
>
> +struct io_alloc_cache {
> + void **entries;
> + unsigned int nr_cached;
> + unsigned int max_cached;
> + size_t elem_size;
> + unsigned int init_clear;
> +};
> +
> struct io_buf_table {
> struct io_rsrc_data data;
> + struct io_alloc_cache node_cache;
> + struct io_alloc_cache imu_cache;
> };
>
> struct io_hash_bucket {
> @@ -222,14 +232,6 @@ struct io_submit_state {
> struct blk_plug plug;
> };
>
> -struct io_alloc_cache {
> - void **entries;
> - unsigned int nr_cached;
> - unsigned int max_cached;
> - unsigned int elem_size;
> - unsigned int init_clear;
> -};
> -
> struct io_ring_ctx {
> /* const or read-mostly hot data */
> struct {
> diff --git a/io_uring/filetable.c b/io_uring/filetable.c
> index dd8eeec97acf6..a21660e3145ab 100644
> --- a/io_uring/filetable.c
> +++ b/io_uring/filetable.c
> @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
> if (slot_index >= ctx->file_table.data.nr)
> return -EINVAL;
>
> - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> if (!node)
> return -ENOMEM;
>
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index fd7a1b04db8b7..26ff9b5851d94 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> #define IORING_MAX_FIXED_FILES (1U << 20)
> #define IORING_MAX_REG_BUFFERS (1U << 14)
>
> +#define IO_CACHED_BVECS_SEGS 32
> +
> int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
> {
> unsigned long page_limit, cur_pages, new_pages;
> @@ -122,19 +124,33 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> kvfree(imu);
> }
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> +
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
nit: extra blank line added here
> {
> struct io_rsrc_node *node;
>
> - node = kzalloc(sizeof(*node), GFP_KERNEL);
> + if (type == IORING_RSRC_FILE)
> + node = kmalloc(sizeof(*node), GFP_KERNEL);
> + else
> + node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
> if (node) {
> node->type = type;
> node->refs = 1;
> + node->tag = 0;
> + node->file_ptr = 0;
> }
> return node;
> }
>
> -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
> +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
> +{
> + kvfree(data->nodes);
> + data->nodes = NULL;
> + data->nr = 0;
> +}
> +
> +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
> + struct io_rsrc_data *data)
> {
> if (!data->nr)
> return;
> @@ -142,9 +158,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
> if (data->nodes[data->nr])
> io_put_rsrc_node(ctx, data->nodes[data->nr]);
> }
> - kvfree(data->nodes);
> - data->nodes = NULL;
> - data->nr = 0;
> + __io_rsrc_data_free(data);
> }
>
> __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> @@ -158,6 +172,33 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> return -ENOMEM;
> }
>
> +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
> +{
> + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
> + IO_CACHED_BVECS_SEGS);
> + int ret;
> +
> + ret = io_rsrc_data_alloc(&table->data, nr);
> + if (ret)
> + return ret;
> +
> + ret = io_alloc_cache_init(&table->node_cache, nr,
> + sizeof(struct io_rsrc_node), 0);
> + if (ret)
> + goto out_1;
> +
> + ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0);
> + if (ret)
> + goto out_2;
io_alloc_cache_init() returns bool. Probably these cases should return
-ENOMEM instead of 1?
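Something along these lines maybe (untested sketch, assuming io_alloc_cache_init()
returns true on allocation failure; the label names are made up):

	ret = io_rsrc_data_alloc(&table->data, nr);
	if (ret)
		return ret;

	ret = -ENOMEM;
	if (io_alloc_cache_init(&table->node_cache, nr,
				sizeof(struct io_rsrc_node), 0))
		goto out_free_data;
	if (io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0))
		goto out_free_node_cache;
	return 0;

out_free_node_cache:
	io_alloc_cache_free(&table->node_cache, kfree);
out_free_data:
	__io_rsrc_data_free(&table->data);
	return ret;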
> +
> + return 0;
> +out_2:
> + io_alloc_cache_free(&table->node_cache, kfree);
> +out_1:
> + __io_rsrc_data_free(&table->data);
> + return ret;
> +}
> +
> static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> struct io_uring_rsrc_update2 *up,
> unsigned nr_args)
> @@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> err = -EBADF;
> break;
> }
> - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> if (!node) {
> err = -ENOMEM;
> fput(file);
> @@ -269,7 +310,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
> }
> node->tag = tag;
> }
> - i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
> + i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr);
Looks like this change belongs in the prior patch "io_uring: add
abstraction for buf_table rsrc data"?
> io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
> ctx->buf_table.data.nodes[i] = node;
> if (ctx->compat)
> @@ -459,6 +500,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> case IORING_RSRC_BUFFER:
> if (node->buf)
> io_buffer_unmap(ctx, node);
> + if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
> + return;
> break;
> default:
> WARN_ON_ONCE(1);
> @@ -527,7 +570,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> goto fail;
> }
> ret = -ENOMEM;
> - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> if (!node) {
> fput(file);
> goto fail;
> @@ -547,11 +590,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> return ret;
> }
>
> +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
> + struct io_buf_table *table)
> +{
> + io_rsrc_data_free(ctx, &table->data);
> + io_alloc_cache_free(&table->node_cache, kfree);
> + io_alloc_cache_free(&table->imu_cache, kfree);
> +}
> +
> int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> {
> if (!ctx->buf_table.data.nr)
> return -ENXIO;
> - io_rsrc_data_free(ctx, &ctx->buf_table.data);
> + io_rsrc_buffer_free(ctx, &ctx->buf_table);
> return 0;
> }
>
> @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> return true;
> }
>
> +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> + int nr_bvecs)
> +{
> + if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
If there is no entry available in the cache, this will heap-allocate
one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
using io_alloc_cache_get() instead of io_cache_alloc(), so the
heap-allocated fallback uses the minimal size.
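Roughly something like this, as an untested sketch of what I mean:

	static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
						   int nr_bvecs)
	{
		struct io_mapped_ubuf *imu = NULL;

		/* only reuse a cached entry for cache-sized registrations */
		if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
			imu = io_alloc_cache_get(&ctx->buf_table.imu_cache);
		/* fall back to a minimally sized heap allocation */
		if (!imu)
			imu = kvmalloc(struct_size(imu, bvec, nr_bvecs),
				       GFP_KERNEL);
		return imu;
	}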
Also, where are these allocations returned to the imu_cache? Looks
like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
needs to try io_alloc_cache_put() first.
Best,
Caleb
> + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
> + GFP_KERNEL);
> +}
> +
> static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> struct iovec *iov,
> struct page **last_hpage)
> @@ -732,7 +792,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> if (!iov->iov_base)
> return NULL;
>
> - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> if (!node)
> return ERR_PTR(-ENOMEM);
> node->buf = NULL;
> @@ -752,7 +812,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
> }
>
> - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
> + imu = io_alloc_imu(ctx, nr_pages);
> if (!imu)
> goto done;
>
> @@ -800,9 +860,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> unsigned int nr_args, u64 __user *tags)
> {
> struct page *last_hpage = NULL;
> - struct io_rsrc_data data;
> struct iovec fast_iov, *iov = &fast_iov;
> const struct iovec __user *uvec;
> + struct io_buf_table table;
> int i, ret;
>
> BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
> @@ -811,13 +871,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> return -EBUSY;
> if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
> return -EINVAL;
> - ret = io_rsrc_data_alloc(&data, nr_args);
> + ret = io_rsrc_buffer_alloc(&table, nr_args);
> if (ret)
> return ret;
>
> if (!arg)
> memset(iov, 0, sizeof(*iov));
>
> + ctx->buf_table = table;
> for (i = 0; i < nr_args; i++) {
> struct io_rsrc_node *node;
> u64 tag = 0;
> @@ -857,10 +918,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> }
> node->tag = tag;
> }
> - data.nodes[i] = node;
> + table.data.nodes[i] = node;
> }
> -
> - ctx->buf_table.data = data;
Is it necessary to move this assignment? I found the existing location
easier to reason about, since the assignment of ctx->buf_table
represents a transfer of ownership from the local variable.
> if (ret)
> io_sqe_buffers_unregister(ctx);
> return ret;
> @@ -891,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
> goto unlock;
> }
>
> - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> if (!node) {
> ret = -ENOMEM;
> goto unlock;
> }
>
> nr_bvecs = blk_rq_nr_phys_segments(rq);
> - imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> +
> + imu = io_alloc_imu(ctx, nr_bvecs);
> if (!imu) {
> kfree(node);
> ret = -ENOMEM;
> @@ -1028,7 +1088,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
> static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
> struct io_uring_clone_buffers *arg)
> {
> - struct io_rsrc_data data;
> + struct io_buf_table table;
> int i, ret, off, nr;
> unsigned int nbufs;
>
> @@ -1059,7 +1119,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
> return -EOVERFLOW;
>
> - ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
> + ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
> if (ret)
> return ret;
>
> @@ -1068,7 +1128,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
>
> if (src_node) {
> - data.nodes[i] = src_node;
> + table.data.nodes[i] = src_node;
> src_node->refs++;
> }
> }
> @@ -1098,7 +1158,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> if (!src_node) {
> dst_node = NULL;
> } else {
> - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> if (!dst_node) {
> ret = -ENOMEM;
> goto out_free;
> @@ -1107,12 +1167,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> refcount_inc(&src_node->buf->refs);
> dst_node->buf = src_node->buf;
> }
> - data.nodes[off++] = dst_node;
> + table.data.nodes[off++] = dst_node;
> i++;
> }
>
> /*
> - * If asked for replace, put the old table. data->nodes[] holds both
> + * If asked for replace, put the old table. table.data->nodes[] holds both
> * old and new nodes at this point.
> */
> if (arg->flags & IORING_REGISTER_DST_REPLACE)
> @@ -1125,10 +1185,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> * entry).
> */
> WARN_ON_ONCE(ctx->buf_table.data.nr);
> - ctx->buf_table.data = data;
> + ctx->buf_table = table;
> return 0;
> out_free:
> - io_rsrc_data_free(ctx, &data);
> + io_rsrc_buffer_free(ctx, &table);
> return ret;
> }
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 2e8d1862caefc..c5bdac558a2b4 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -47,7 +47,7 @@ struct io_imu_folio_data {
> unsigned int nr_folios;
> };
>
> -struct io_rsrc_node *io_rsrc_node_alloc(int type);
> +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
> void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
> void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
> int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
> --
> 2.43.5
>
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
2025-02-15 2:22 ` Caleb Sander Mateos
@ 2025-02-16 22:43 ` Caleb Sander Mateos
2025-02-18 20:12 ` Keith Busch
2025-02-18 20:09 ` Keith Busch
1 sibling, 1 reply; 17+ messages in thread
From: Caleb Sander Mateos @ 2025-02-16 22:43 UTC (permalink / raw)
To: Keith Busch
Cc: ming.lei, asml.silence, axboe, linux-block, io-uring, bernd,
Keith Busch
On Fri, Feb 14, 2025 at 6:22 PM Caleb Sander Mateos
<[email protected]> wrote:
>
> On Fri, Feb 14, 2025 at 7:46 AM Keith Busch <[email protected]> wrote:
> >
> > From: Keith Busch <[email protected]>
> >
> > Frequent alloc/free cycles on these are pretty costly. Use an io cache to
> > more efficiently reuse these buffers.
> >
> > Signed-off-by: Keith Busch <[email protected]>
> > ---
> > include/linux/io_uring_types.h | 18 +++---
> > io_uring/filetable.c | 2 +-
> > io_uring/rsrc.c | 114 +++++++++++++++++++++++++--------
> > io_uring/rsrc.h | 2 +-
> > 4 files changed, 99 insertions(+), 37 deletions(-)
> >
> > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> > index d8d717cce427f..ebaaa1c7e210f 100644
> > --- a/include/linux/io_uring_types.h
> > +++ b/include/linux/io_uring_types.h
> > @@ -67,8 +67,18 @@ struct io_file_table {
> > unsigned int alloc_hint;
> > };
> >
> > +struct io_alloc_cache {
> > + void **entries;
> > + unsigned int nr_cached;
> > + unsigned int max_cached;
> > + size_t elem_size;
> > + unsigned int init_clear;
> > +};
> > +
> > struct io_buf_table {
> > struct io_rsrc_data data;
> > + struct io_alloc_cache node_cache;
> > + struct io_alloc_cache imu_cache;
> > };
> >
> > struct io_hash_bucket {
> > @@ -222,14 +232,6 @@ struct io_submit_state {
> > struct blk_plug plug;
> > };
> >
> > -struct io_alloc_cache {
> > - void **entries;
> > - unsigned int nr_cached;
> > - unsigned int max_cached;
> > - unsigned int elem_size;
> > - unsigned int init_clear;
> > -};
> > -
> > struct io_ring_ctx {
> > /* const or read-mostly hot data */
> > struct {
> > diff --git a/io_uring/filetable.c b/io_uring/filetable.c
> > index dd8eeec97acf6..a21660e3145ab 100644
> > --- a/io_uring/filetable.c
> > +++ b/io_uring/filetable.c
> > @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
> > if (slot_index >= ctx->file_table.data.nr)
> > return -EINVAL;
> >
> > - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> > if (!node)
> > return -ENOMEM;
> >
> > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> > index fd7a1b04db8b7..26ff9b5851d94 100644
> > --- a/io_uring/rsrc.c
> > +++ b/io_uring/rsrc.c
> > @@ -32,6 +32,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> > #define IORING_MAX_FIXED_FILES (1U << 20)
> > #define IORING_MAX_REG_BUFFERS (1U << 14)
> >
> > +#define IO_CACHED_BVECS_SEGS 32
> > +
> > int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
> > {
> > unsigned long page_limit, cur_pages, new_pages;
> > @@ -122,19 +124,33 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> > kvfree(imu);
> > }
> >
> > -struct io_rsrc_node *io_rsrc_node_alloc(int type)
> > +
> > +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
>
> nit: extra blank line added here
>
> > {
> > struct io_rsrc_node *node;
> >
> > - node = kzalloc(sizeof(*node), GFP_KERNEL);
> > + if (type == IORING_RSRC_FILE)
> > + node = kmalloc(sizeof(*node), GFP_KERNEL);
> > + else
> > + node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL);
> > if (node) {
> > node->type = type;
> > node->refs = 1;
> > + node->tag = 0;
> > + node->file_ptr = 0;
> > }
> > return node;
> > }
> >
> > -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
> > +static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
> > +{
> > + kvfree(data->nodes);
> > + data->nodes = NULL;
> > + data->nr = 0;
> > +}
> > +
> > +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
> > + struct io_rsrc_data *data)
> > {
> > if (!data->nr)
> > return;
> > @@ -142,9 +158,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
> > if (data->nodes[data->nr])
> > io_put_rsrc_node(ctx, data->nodes[data->nr]);
> > }
> > - kvfree(data->nodes);
> > - data->nodes = NULL;
> > - data->nr = 0;
> > + __io_rsrc_data_free(data);
> > }
> >
> > __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> > @@ -158,6 +172,33 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
> > return -ENOMEM;
> > }
> >
> > +static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
> > +{
> > + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
> > + IO_CACHED_BVECS_SEGS);
> > + int ret;
> > +
> > + ret = io_rsrc_data_alloc(&table->data, nr);
> > + if (ret)
> > + return ret;
> > +
> > + ret = io_alloc_cache_init(&table->node_cache, nr,
> > + sizeof(struct io_rsrc_node), 0);
> > + if (ret)
> > + goto out_1;
> > +
> > + ret = io_alloc_cache_init(&table->imu_cache, nr, imu_cache_size, 0);
> > + if (ret)
> > + goto out_2;
>
> io_alloc_cache_init() returns bool. Probably these cases should return
> -ENOMEM instead of 1?
>
> > +
> > + return 0;
> > +out_2:
> > + io_alloc_cache_free(&table->node_cache, kfree);
> > +out_1:
> > + __io_rsrc_data_free(&table->data);
> > + return ret;
> > +}
> > +
> > static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> > struct io_uring_rsrc_update2 *up,
> > unsigned nr_args)
> > @@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
> > err = -EBADF;
> > break;
> > }
> > - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> > if (!node) {
> > err = -ENOMEM;
> > fput(file);
> > @@ -269,7 +310,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
> > }
> > node->tag = tag;
> > }
> > - i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
> > + i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr);
>
> Looks like this change belongs in the prior patch "io_uring: add
> abstraction for buf_table rsrc data"?
>
> > io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
> > ctx->buf_table.data.nodes[i] = node;
> > if (ctx->compat)
> > @@ -459,6 +500,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> > case IORING_RSRC_BUFFER:
> > if (node->buf)
> > io_buffer_unmap(ctx, node);
> > + if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
> > + return;
> > break;
> > default:
> > WARN_ON_ONCE(1);
> > @@ -527,7 +570,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> > goto fail;
> > }
> > ret = -ENOMEM;
> > - node = io_rsrc_node_alloc(IORING_RSRC_FILE);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
> > if (!node) {
> > fput(file);
> > goto fail;
> > @@ -547,11 +590,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> > return ret;
> > }
> >
> > +static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
> > + struct io_buf_table *table)
> > +{
> > + io_rsrc_data_free(ctx, &table->data);
> > + io_alloc_cache_free(&table->node_cache, kfree);
> > + io_alloc_cache_free(&table->imu_cache, kfree);
> > +}
> > +
> > int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> > {
> > if (!ctx->buf_table.data.nr)
> > return -ENXIO;
> > - io_rsrc_data_free(ctx, &ctx->buf_table.data);
> > + io_rsrc_buffer_free(ctx, &ctx->buf_table);
> > return 0;
> > }
> >
> > @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> > return true;
> > }
> >
> > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > + int nr_bvecs)
> > +{
> > + if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
>
> If there is no entry available in the cache, this will heap-allocate
> one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
> using io_alloc_cache_get() instead of io_cache_alloc(), so the
> heap-allocated fallback uses the minimal size.
>
> Also, where are these allocations returned to the imu_cache? Looks
> like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> needs to try io_alloc_cache_put() first.
Another issue I see is that io_alloc_cache elements are allocated with
kmalloc(), so they can't be freed with kvfree(). When the imu is
freed, we could check nr_bvecs <= IO_CACHED_BVECS_SEGS to tell whether
to call io_alloc_cache_put() (with a fallback to kfree()) or kvfree().
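As a rough sketch (the helper name is made up, and it assumes imu->nr_bvecs still
reflects the allocation size at free time):

	static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
	{
		/* cache-sized allocations go back to the cache when possible */
		if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) {
			if (!io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
				kfree(imu);
		} else {
			kvfree(imu);
		}
	}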
>
> Best,
> Caleb
>
> > + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
> > + GFP_KERNEL);
> > +}
> > +
> > static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> > struct iovec *iov,
> > struct page **last_hpage)
> > @@ -732,7 +792,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> > if (!iov->iov_base)
> > return NULL;
> >
> > - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> > if (!node)
> > return ERR_PTR(-ENOMEM);
> > node->buf = NULL;
> > @@ -752,7 +812,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
> > coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
> > }
> >
> > - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
> > + imu = io_alloc_imu(ctx, nr_pages);
> > if (!imu)
> > goto done;
> >
> > @@ -800,9 +860,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> > unsigned int nr_args, u64 __user *tags)
> > {
> > struct page *last_hpage = NULL;
> > - struct io_rsrc_data data;
> > struct iovec fast_iov, *iov = &fast_iov;
> > const struct iovec __user *uvec;
> > + struct io_buf_table table;
> > int i, ret;
> >
> > BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
> > @@ -811,13 +871,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> > return -EBUSY;
> > if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
> > return -EINVAL;
> > - ret = io_rsrc_data_alloc(&data, nr_args);
> > + ret = io_rsrc_buffer_alloc(&table, nr_args);
> > if (ret)
> > return ret;
> >
> > if (!arg)
> > memset(iov, 0, sizeof(*iov));
> >
> > + ctx->buf_table = table;
> > for (i = 0; i < nr_args; i++) {
> > struct io_rsrc_node *node;
> > u64 tag = 0;
> > @@ -857,10 +918,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
> > }
> > node->tag = tag;
> > }
> > - data.nodes[i] = node;
> > + table.data.nodes[i] = node;
> > }
> > -
> > - ctx->buf_table.data = data;
>
> Is it necessary to move this assignment? I found the existing location
> easier to reason about, since the assignment of ctx->buf_table
> represents a transfer of ownership from the local variable.
>
> > if (ret)
> > io_sqe_buffers_unregister(ctx);
> > return ret;
> > @@ -891,14 +950,15 @@ int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
> > goto unlock;
> > }
> >
> > - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> > + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> > if (!node) {
> > ret = -ENOMEM;
> > goto unlock;
> > }
> >
> > nr_bvecs = blk_rq_nr_phys_segments(rq);
> > - imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> > +
> > + imu = io_alloc_imu(ctx, nr_bvecs);
> > if (!imu) {
> > kfree(node);
> > ret = -ENOMEM;
> > @@ -1028,7 +1088,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
> > static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
> > struct io_uring_clone_buffers *arg)
> > {
> > - struct io_rsrc_data data;
> > + struct io_buf_table table;
> > int i, ret, off, nr;
> > unsigned int nbufs;
> >
> > @@ -1059,7 +1119,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
> > return -EOVERFLOW;
> >
> > - ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
> > + ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
> > if (ret)
> > return ret;
> >
> > @@ -1068,7 +1128,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
> >
> > if (src_node) {
> > - data.nodes[i] = src_node;
> > + table.data.nodes[i] = src_node;
> > src_node->refs++;
> > }
> > }
> > @@ -1098,7 +1158,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > if (!src_node) {
> > dst_node = NULL;
> > } else {
> > - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> > + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
> > if (!dst_node) {
> > ret = -ENOMEM;
> > goto out_free;
> > @@ -1107,12 +1167,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > refcount_inc(&src_node->buf->refs);
> > dst_node->buf = src_node->buf;
> > }
> > - data.nodes[off++] = dst_node;
> > + table.data.nodes[off++] = dst_node;
> > i++;
> > }
> >
> > /*
> > - * If asked for replace, put the old table. data->nodes[] holds both
> > + * If asked for replace, put the old table. table.data->nodes[] holds both
> > * old and new nodes at this point.
> > */
> > if (arg->flags & IORING_REGISTER_DST_REPLACE)
> > @@ -1125,10 +1185,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
> > * entry).
> > */
> > WARN_ON_ONCE(ctx->buf_table.data.nr);
> > - ctx->buf_table.data = data;
> > + ctx->buf_table = table;
> > return 0;
> > out_free:
> > - io_rsrc_data_free(ctx, &data);
> > + io_rsrc_buffer_free(ctx, &table);
> > return ret;
> > }
> >
> > diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> > index 2e8d1862caefc..c5bdac558a2b4 100644
> > --- a/io_uring/rsrc.h
> > +++ b/io_uring/rsrc.h
> > @@ -47,7 +47,7 @@ struct io_imu_folio_data {
> > unsigned int nr_folios;
> > };
> >
> > -struct io_rsrc_node *io_rsrc_node_alloc(int type);
> > +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
> > void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
> > void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
> > int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
> > --
> > 2.43.5
> >
> >
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
2025-02-16 22:43 ` Caleb Sander Mateos
@ 2025-02-18 20:12 ` Keith Busch
2025-02-18 20:45 ` Caleb Sander Mateos
0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2025-02-18 20:12 UTC (permalink / raw)
To: Caleb Sander Mateos
Cc: Keith Busch, ming.lei, asml.silence, axboe, linux-block, io-uring,
bernd
On Sun, Feb 16, 2025 at 02:43:40PM -0800, Caleb Sander Mateos wrote:
> > > + io_alloc_cache_free(&table->imu_cache, kfree);
> > > +}
> > > +
> > > int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> > > {
> > > if (!ctx->buf_table.data.nr)
> > > return -ENXIO;
> > > - io_rsrc_data_free(ctx, &ctx->buf_table.data);
> > > + io_rsrc_buffer_free(ctx, &ctx->buf_table);
> > > return 0;
> > > }
> > >
> > > @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> > > return true;
> > > }
> > >
> > > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > > + int nr_bvecs)
> > > +{
> > > + if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > > + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> >
> > If there is no entry available in the cache, this will heap-allocate
> > one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
> > using io_alloc_cache_get() instead of io_cache_alloc(), so the
> > heap-allocated fallback uses the minimal size.
> >
> > Also, where are these allocations returned to the imu_cache? Looks
> > like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> > needs to try io_alloc_cache_put() first.
>
> Another issue I see is that io_alloc_cache elements are allocated with
> kmalloc(), so they can't be freed with kvfree().
You actually can kvfree(kmalloc()); Here's the kernel doc for it:
kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc()
> When the imu is
> freed, we could check nr_bvecs <= IO_CACHED_BVECS_SEGS to tell whether
> to call io_alloc_cache_put() (with a fallback to kfree()) or kvfree().
But you're right, it shouldn't even hit this path because it's supposed
to try to insert the imu into the cache if that's where it was allocated
from.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
2025-02-18 20:12 ` Keith Busch
@ 2025-02-18 20:45 ` Caleb Sander Mateos
0 siblings, 0 replies; 17+ messages in thread
From: Caleb Sander Mateos @ 2025-02-18 20:45 UTC (permalink / raw)
To: Keith Busch
Cc: Keith Busch, ming.lei, asml.silence, axboe, linux-block, io-uring,
bernd
On Tue, Feb 18, 2025 at 12:12 PM Keith Busch <[email protected]> wrote:
>
> On Sun, Feb 16, 2025 at 02:43:40PM -0800, Caleb Sander Mateos wrote:
> > > > + io_alloc_cache_free(&table->imu_cache, kfree);
> > > > +}
> > > > +
> > > > int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
> > > > {
> > > > if (!ctx->buf_table.data.nr)
> > > > return -ENXIO;
> > > > - io_rsrc_data_free(ctx, &ctx->buf_table.data);
> > > > + io_rsrc_buffer_free(ctx, &ctx->buf_table);
> > > > return 0;
> > > > }
> > > >
> > > > @@ -716,6 +767,15 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
> > > > return true;
> > > > }
> > > >
> > > > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > > > + int nr_bvecs)
> > > > +{
> > > > + if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > > > + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> > >
> > > If there is no entry available in the cache, this will heap-allocate
> > > one with enough space for all IO_CACHED_BVECS_SEGS bvecs. Consider
> > > using io_alloc_cache_get() instead of io_cache_alloc(), so the
> > > heap-allocated fallback uses the minimal size.
> > >
> > > Also, where are these allocations returned to the imu_cache? Looks
> > > like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> > > needs to try io_alloc_cache_put() first.
> >
> > Another issue I see is that io_alloc_cache elements are allocated with
> > kmalloc(), so they can't be freed with kvfree().
>
> You actually can kvfree(kmalloc()); Here's the kernel doc for it:
>
> kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc()
Good to know, thanks for the pointer! I guess it might be a bit more
efficient to call kfree() if we know based on nr_bvecs that the
allocation came from kmalloc(), but at least this isn't corrupting the
heap.
Best,
Caleb
>
> > When the imu is
> > freed, we could check nr_bvecs <= IO_CACHED_BVECS_SEGS to tell whether
> > to call io_alloc_cache_put() (with a fallback to kfree()) or kvfree().
>
> But you're right, it shouldn't even hit this path because it's supposed
> to try to insert the imu into the cache if that's where it was allocated
> from.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
2025-02-15 2:22 ` Caleb Sander Mateos
2025-02-16 22:43 ` Caleb Sander Mateos
@ 2025-02-18 20:09 ` Keith Busch
2025-02-18 20:42 ` Caleb Sander Mateos
1 sibling, 1 reply; 17+ messages in thread
From: Keith Busch @ 2025-02-18 20:09 UTC (permalink / raw)
To: Caleb Sander Mateos
Cc: Keith Busch, ming.lei, asml.silence, axboe, linux-block, io-uring,
bernd
On Fri, Feb 14, 2025 at 06:22:09PM -0800, Caleb Sander Mateos wrote:
> > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > + int nr_bvecs)
> > +{
> > + if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
>
> If there is no entry available in the cache, this will heap-allocate
> one with enough space for all IO_CACHED_BVECS_SEGS bvecs.
The cache can only have fixed-size objects in it, so we have to pick
some kind of trade-off. The cache starts off empty and fills up on
demand, so whatever we allocate needs to be of that cache's element
size.
> Consider
> using io_alloc_cache_get() instead of io_cache_alloc(), so the
> heap-allocated fallback uses the minimal size.
We wouldn't be able to fill the cache with the new dynamic object if we
did that.
> Also, where are these allocations returned to the imu_cache? Looks
> like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> needs to try io_alloc_cache_put() first.
Oops. I kind of rushed this series last Friday as I needed to shut down
very early in the day.
Thanks for the comments. Will take my time for the next version.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
2025-02-18 20:09 ` Keith Busch
@ 2025-02-18 20:42 ` Caleb Sander Mateos
2025-02-18 21:12 ` Keith Busch
0 siblings, 1 reply; 17+ messages in thread
From: Caleb Sander Mateos @ 2025-02-18 20:42 UTC (permalink / raw)
To: Keith Busch
Cc: Keith Busch, ming.lei, asml.silence, axboe, linux-block, io-uring,
bernd
On Tue, Feb 18, 2025 at 12:09 PM Keith Busch <[email protected]> wrote:
>
> On Fri, Feb 14, 2025 at 06:22:09PM -0800, Caleb Sander Mateos wrote:
> > > +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
> > > + int nr_bvecs)
> > > +{
> > > + if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
> > > + return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL);
> >
> > If there is no entry available in the cache, this will heap-allocate
> > one with enough space for all IO_CACHED_BVECS_SEGS bvecs.
>
> The cache can only have fixed-size objects in it, so we have to pick
> some kind of trade-off. The cache starts off empty and fills up on
> demand, so whatever we allocate needs to be of that cache's element
> size.
>
> > Consider
> > using io_alloc_cache_get() instead of io_cache_alloc(), so the
> > heap-allocated fallback uses the minimal size.
>
> We wouldn't be able to fill the cache with the new dynamic object if we
> did that.
Right, that's a good point that there's a tradeoff. I think always
allocating space for IO_CACHED_BVECS_SEGS bvecs is reasonable. Maybe
IO_CACHED_BVECS_SEGS should be slightly smaller so the allocation fits
nicely in a power of 2? Currently it looks like it takes up 560 bytes:
>>> 48 + 16 * 32
560
Using IO_CACHED_BVECS_SEGS = 29 instead would make it 512 bytes:
>>> 48 + 16 * 29
512
Best,
Caleb
>
> > Also, where are these allocations returned to the imu_cache? Looks
> > like kvfree(imu) in io_buffer_unmap() and io_sqe_buffer_register()
> > needs to try io_alloc_cache_put() first.
>
> Oops. I kind of rushed this series last Friday as I needed to shut down
> very early in the day.
>
> Thanks for the comments. Will take my time for the next version.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCHv3 5/5] io_uring: cache nodes and mapped buffers
2025-02-18 20:42 ` Caleb Sander Mateos
@ 2025-02-18 21:12 ` Keith Busch
0 siblings, 0 replies; 17+ messages in thread
From: Keith Busch @ 2025-02-18 21:12 UTC (permalink / raw)
To: Caleb Sander Mateos
Cc: Keith Busch, ming.lei, asml.silence, axboe, linux-block, io-uring,
bernd
On Tue, Feb 18, 2025 at 12:42:19PM -0800, Caleb Sander Mateos wrote:
> Right, that's a good point that there's a tradeoff. I think always
> allocating space for IO_CACHED_BVECS_SEGS bvecs is reasonable. Maybe
> IO_CACHED_BVECS_SEGS should be slightly smaller so the allocation fits
> nicely in a power of 2? Currently it looks to take up 560 bytes:
> >>> 48 + 16 * 32
> 560
>
> Using IO_CACHED_BVECS_SEGS = 29 instead would make it 512 bytes:
> >>> 48 + 16 * 29
> 512
Right, and it's even smaller on 32-bit architectures.
I don't think it's worth optimizing the cached object size like this.
It's probably better we optimize for a particular transfer size. If your
bvec is physically discontiguous, 32 bvecs gets you to 128k transfers on
a 4k page platform. That seems like a common transfer size for
benchmarking throughput. It is arbitrary at the end of the day, so I'm
not set on that if there's an argument for something different.
^ permalink raw reply [flat|nested] 17+ messages in thread