* [PATCH 1/6] block: const blk_rq_nr_phys_segments request
From: Keith Busch @ 2025-02-03 15:45 UTC (permalink / raw)
To: io-uring, linux-block, ming.lei, axboe, asml.silence; +Cc: Keith Busch
From: Keith Busch <[email protected]>
The request is not modified. Mark it as const so that functions that
only hold a const request pointer may use this helper.
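For example, a helper that only inspects the request can now call it
without casting away const (hypothetical caller, shown for illustration
only; a later patch in this series adds a real const user):

  static unsigned short example_nr_segs(const struct request *rq)
  {
          /* previously required a non-const rq */
          return blk_rq_nr_phys_segments(rq);
  }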
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/blk-mq.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a0a9007cc1e36..56ef03bc68884 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1125,7 +1125,7 @@ void blk_abort_request(struct request *);
* own special payload. In that case we still return 1 here so that this
* special payload will be mapped.
*/
-static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
+static inline unsigned short blk_rq_nr_phys_segments(const struct request *rq)
{
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
return 1;
--
2.43.5
* [PATCH 2/6] io_uring: use node for import
From: Keith Busch @ 2025-02-03 15:45 UTC (permalink / raw)
To: io-uring, linux-block, ming.lei, axboe, asml.silence; +Cc: Keith Busch
From: Jens Axboe <[email protected]>
Pass the parent node to io_import_fixed() rather than the mapped buffer
it contains. This prepares for supporting different node types that need
type-specific handling.
Signed-off-by: Jens Axboe <[email protected]>
Signed-off-by: Keith Busch <[email protected]>
---
io_uring/net.c | 3 +--
io_uring/rsrc.c | 6 +++---
io_uring/rsrc.h | 5 ++---
io_uring/rw.c | 2 +-
io_uring/uring_cmd.c | 2 +-
5 files changed, 8 insertions(+), 10 deletions(-)
diff --git a/io_uring/net.c b/io_uring/net.c
index 85f55fbc25c94..4e9d0f04b902d 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1359,8 +1359,7 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
return ret;
ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
- node->buf, (u64)(uintptr_t)sr->buf,
- sr->len);
+ node, (u64)(uintptr_t)sr->buf, sr->len);
if (unlikely(ret))
return ret;
kmsg->msg.sg_from_iter = io_sg_from_iter;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index af39b69eb4fde..4d0e1c06c8bc6 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -860,10 +860,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node,
+ u64 buf_addr, size_t len)
{
+ struct io_mapped_ubuf *imu = node->buf;
u64 buf_end;
size_t offset;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 190f7ee45de93..abd0d5d42c3e1 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,9 +50,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node,
+ u64 buf_addr, size_t len);
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index a9a2733be8420..d6332d019dd56 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -393,7 +393,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
io_req_assign_buf_node(req, node);
io = req->async_data;
- ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
+ ret = io_import_fixed(ddir, &io->iter, node, rw->addr, rw->len);
iov_iter_save_state(&io->iter, &io->iter_state);
return ret;
}
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index fc94c465a9850..b7b9baf30d728 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -281,7 +281,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
/* Must have had rsrc_node assigned at prep time */
if (node)
- return io_import_fixed(rw, iter, node->buf, ubuf, len);
+ return io_import_fixed(rw, iter, node, ubuf, len);
return -EFAULT;
}
--
2.43.5
* [PATCH 3/6] io_uring: add support for kernel registered bvecs
From: Keith Busch @ 2025-02-03 15:45 UTC (permalink / raw)
To: io-uring, linux-block, ming.lei, axboe, asml.silence; +Cc: Keith Busch
From: Keith Busch <[email protected]>
Provide an interface for the kernel to register buffers in io_uring's
existing fixed buffer table. User space can then reference these buffers
to achieve zero-copy IO.
User space must register an empty fixed buffer table with io_uring in
order for the kernel to make use of it.
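A rough sketch of how a driver could consume this (hypothetical helper,
for illustration only; the ublk conversion later in this series is the
real user). It assumes the caller runs where ctx->uring_lock is held,
e.g. from an io_uring_cmd handler, and holds a reference on the request
so it cannot complete while registered:

  static int example_expose_request(struct io_ring_ctx *ctx,
                                    const struct request *rq,
                                    unsigned int index)
  {
          int ret;

          /*
           * Install rq's bio_vecs into fixed buffer slot 'index'. This
           * fails with -EINVAL if no table is registered or the index is
           * out of range, and -EBUSY if the slot is already occupied.
           */
          ret = io_buffer_register_bvec(ctx, rq, index);
          if (ret)
                  return ret;

          /* ... user space issues fixed buffer IO against 'index' ... */

          io_buffer_unregister_bvec(ctx, index);
          return 0;
  }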
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/io_uring.h | 1 +
include/linux/io_uring_types.h | 3 +
io_uring/rsrc.c | 114 +++++++++++++++++++++++++++++++--
io_uring/rsrc.h | 1 +
4 files changed, 114 insertions(+), 5 deletions(-)
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 85fe4e6b275c7..b5637a2aae340 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/xarray.h>
#include <uapi/linux/io_uring.h>
+#include <linux/blk-mq.h>
#if defined(CONFIG_IO_URING)
void __io_uring_cancel(bool cancel_all);
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 623d8e798a11a..7e5a5a70c35f2 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -695,4 +695,7 @@ static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
return ctx->flags & IORING_SETUP_CQE32;
}
+int io_buffer_register_bvec(struct io_ring_ctx *ctx, const struct request *rq, unsigned int tag);
+void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int tag);
+
#endif
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4d0e1c06c8bc6..8c4c374abcc10 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -111,7 +111,10 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
if (!refcount_dec_and_test(&imu->refs))
return;
for (i = 0; i < imu->nr_bvecs; i++)
- unpin_user_page(imu->bvec[i].bv_page);
+ if (node->type == IORING_RSRC_KBUF)
+ put_page(imu->bvec[i].bv_page);
+ else
+ unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
kvfree(imu);
@@ -240,6 +243,13 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
struct io_rsrc_node *node;
u64 tag = 0;
+ i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
+ node = io_rsrc_node_lookup(&ctx->buf_table, i);
+ if (node && node->type != IORING_RSRC_BUFFER) {
+ err = -EBUSY;
+ break;
+ }
+
uvec = u64_to_user_ptr(user_data);
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
if (IS_ERR(iov)) {
@@ -258,6 +268,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
err = PTR_ERR(node);
break;
}
+
if (tag) {
if (!node) {
err = -EINVAL;
@@ -265,7 +276,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
}
node->tag = tag;
}
- i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
io_reset_rsrc_node(ctx, &ctx->buf_table, i);
ctx->buf_table.nodes[i] = node;
if (ctx->compat)
@@ -453,6 +463,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
fput(io_slot_file(node));
break;
case IORING_RSRC_BUFFER:
+ case IORING_RSRC_KBUF:
if (node->buf)
io_buffer_unmap(ctx, node);
break;
@@ -860,6 +871,92 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
+static struct io_rsrc_node *io_buffer_alloc_node(struct io_ring_ctx *ctx,
+ unsigned int nr_bvecs,
+ unsigned int len)
+{
+ struct io_mapped_ubuf *imu;
+ struct io_rsrc_node *node;
+
+ node = io_rsrc_node_alloc(IORING_RSRC_KBUF);
+ if (!node)
+ return NULL;
+
+ imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+ if (!imu) {
+ io_put_rsrc_node(ctx, node);
+ return NULL;
+ }
+
+ imu->ubuf = 0;
+ imu->len = len;
+ imu->acct_pages = 0;
+ imu->nr_bvecs = nr_bvecs;
+ refcount_set(&imu->refs, 1);
+
+ node->buf = imu;
+ return node;
+}
+
+int io_buffer_register_bvec(struct io_ring_ctx *ctx, const struct request *rq,
+ unsigned int index)
+{
+ struct io_rsrc_data *data = &ctx->buf_table;
+ u16 nr_bvecs = blk_rq_nr_phys_segments(rq);
+ struct req_iterator rq_iter;
+ struct io_rsrc_node *node;
+ struct bio_vec bv;
+ int i = 0;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ if (WARN_ON_ONCE(!data->nr))
+ return -EINVAL;
+ if (WARN_ON_ONCE(index >= data->nr))
+ return -EINVAL;
+
+ node = data->nodes[index];
+ if (WARN_ON_ONCE(node))
+ return -EBUSY;
+
+ node = io_buffer_alloc_node(ctx, nr_bvecs, blk_rq_bytes(rq));
+ if (!node)
+ return -ENOMEM;
+
+ rq_for_each_bvec(bv, rq, rq_iter) {
+ get_page(bv.bv_page);
+ node->buf->bvec[i].bv_page = bv.bv_page;
+ node->buf->bvec[i].bv_len = bv.bv_len;
+ node->buf->bvec[i].bv_offset = bv.bv_offset;
+ i++;
+ }
+ data->nodes[index] = node;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
+
+void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index)
+{
+ struct io_rsrc_data *data = &ctx->buf_table;
+ struct io_rsrc_node *node;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ if (WARN_ON_ONCE(!data->nr))
+ return;
+ if (WARN_ON_ONCE(index >= data->nr))
+ return;
+
+ node = data->nodes[index];
+ if (WARN_ON_ONCE(!node || !node->buf))
+ return;
+ if (WARN_ON_ONCE(node->type != IORING_RSRC_KBUF))
+ return;
+ io_reset_rsrc_node(ctx, data, index);
+}
+EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
+
int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node,
u64 buf_addr, size_t len)
{
@@ -886,8 +983,8 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node,
/*
* Don't use iov_iter_advance() here, as it's really slow for
* using the latter parts of a big fixed buffer - it iterates
- * over each segment manually. We can cheat a bit here, because
- * we know that:
+ * over each segment manually. We can cheat a bit here for user
+ * registered nodes, because we know that:
*
* 1) it's a BVEC iter, we set it up
* 2) all bvecs are the same in size, except potentially the
@@ -901,7 +998,14 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node,
*/
const struct bio_vec *bvec = imu->bvec;
- if (offset < bvec->bv_len) {
+ /*
+ * Kernel buffer bvecs, on the other hand, don't necessarily
+ * have the size property of user registered ones, so we have
+ * to use the slow iter advance.
+ */
+ if (node->type == IORING_RSRC_KBUF)
+ iov_iter_advance(iter, offset);
+ else if (offset < bvec->bv_len) {
iter->iov_offset = offset;
} else {
unsigned long seg_skip;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index abd0d5d42c3e1..d1d90d9cd2b43 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,6 +13,7 @@
enum {
IORING_RSRC_FILE = 0,
IORING_RSRC_BUFFER = 1,
+ IORING_RSRC_KBUF = 2,
};
struct io_rsrc_node {
--
2.43.5
* [PATCH 4/6] ublk: zc register/unregister bvec
From: Keith Busch @ 2025-02-03 15:45 UTC (permalink / raw)
To: io-uring, linux-block, ming.lei, axboe, asml.silence; +Cc: Keith Busch
From: Keith Busch <[email protected]>
Provide new operations for the user to request mapping an active
request's data to an io_uring instance's buf_table. The user has to
provide the index at which it wants to install the buffer.
A reference count is taken on the request to ensure it can't be
completed while it is active in a ring's buf_table.
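A rough user-space sketch of issuing the new register command
(illustrative only; assumes liburing, <string.h> and the updated
<linux/ublk_cmd.h>, and that the ring, char device fd, q_id and tag come
from the caller). The server would then issue fixed-buffer IO against
the chosen slot before unregistering it:

  static int example_register_zc_buf(struct io_uring *ring, int ublk_ch_fd,
                                     __u16 q_id, __u16 tag, __u64 index)
  {
          struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
          struct ublksrv_io_cmd *io_cmd;

          if (!sqe)
                  return -EAGAIN;
          memset(sqe, 0, sizeof(*sqe));
          io_cmd = (struct ublksrv_io_cmd *)sqe->cmd;

          sqe->opcode = IORING_OP_URING_CMD;
          sqe->fd = ublk_ch_fd;                   /* /dev/ublkcN */
          sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF;
          io_cmd->q_id = q_id;                    /* queue of the request */
          io_cmd->tag = tag;                      /* tag of the request */
          io_cmd->addr = index;                   /* buf_table slot to fill */

          return io_uring_submit(ring);
  }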
Signed-off-by: Keith Busch <[email protected]>
---
drivers/block/ublk_drv.c | 139 +++++++++++++++++++++++++---------
include/uapi/linux/ublk_cmd.h | 4 +
2 files changed, 107 insertions(+), 36 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 529085181f355..58f224b5687b9 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -51,6 +51,9 @@
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
+#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
+
/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
| UBLK_F_URING_CMD_COMP_IN_TASK \
@@ -76,6 +79,9 @@ struct ublk_rq_data {
struct llist_node node;
struct kref ref;
+
+#define UBLK_ZC_REGISTERED 0
+ unsigned long flags;
};
struct ublk_uring_cmd_pdu {
@@ -201,7 +207,7 @@ static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
- return ub->dev_info.flags & UBLK_F_USER_COPY;
+ return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
}
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
@@ -581,7 +587,7 @@ static void ublk_apply_params(struct ublk_device *ub)
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
- return ubq->flags & UBLK_F_USER_COPY;
+ return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY);
}
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@@ -1747,6 +1753,96 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
+
+static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
+ struct ublk_queue *ubq, int tag, size_t offset)
+{
+ struct request *req;
+
+ if (!ublk_need_req_ref(ubq))
+ return NULL;
+
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (!req)
+ return NULL;
+
+ if (!ublk_get_req_ref(ubq, req))
+ return NULL;
+
+ if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
+ goto fail_put;
+
+ if (!ublk_rq_has_data(req))
+ goto fail_put;
+
+ if (offset > blk_rq_bytes(req))
+ goto fail_put;
+
+ return req;
+fail_put:
+ ublk_put_req_ref(ubq, req);
+ return NULL;
+}
+
+static int ublk_register_io_buf(struct io_uring_cmd *cmd,
+ struct ublk_queue *ubq, int tag,
+ const struct ublksrv_io_cmd *ub_cmd)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct ublk_device *ub = cmd->file->private_data;
+ int index = (int)ub_cmd->addr, ret;
+ struct ublk_rq_data *data;
+ struct request *req;
+
+ if (!ub)
+ return -EPERM;
+
+ req = __ublk_check_and_get_req(ub, ubq, tag, 0);
+ if (!req)
+ return -EINVAL;
+
+ data = blk_mq_rq_to_pdu(req);
+ if (test_and_set_bit(UBLK_ZC_REGISTERED, &data->flags)) {
+ ublk_put_req_ref(ubq, req);
+ return -EBUSY;
+ }
+
+ ret = io_buffer_register_bvec(ctx, req, index);
+ if (ret) {
+ clear_bit(UBLK_ZC_REGISTERED, &data->flags);
+ ublk_put_req_ref(ubq, req);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
+ struct ublk_queue *ubq, int tag,
+ const struct ublksrv_io_cmd *ub_cmd)
+{
+ struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+ struct ublk_device *ub = cmd->file->private_data;
+ int index = (int)ub_cmd->addr;
+ struct ublk_rq_data *data;
+ struct request *req;
+
+ if (!ub)
+ return -EPERM;
+
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (!req)
+ return -EINVAL;
+
+ data = blk_mq_rq_to_pdu(req);
+ if (!test_and_clear_bit(UBLK_ZC_REGISTERED, &data->flags))
+ return -EINVAL;
+
+ ublk_put_req_ref(ubq, req);
+ io_buffer_unregister_bvec(ctx, index);
+ return 0;
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1798,6 +1894,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
ret = -EINVAL;
switch (_IOC_NR(cmd_op)) {
+ case UBLK_IO_REGISTER_IO_BUF:
+ return ublk_register_io_buf(cmd, ubq, tag, ub_cmd);
+ case UBLK_IO_UNREGISTER_IO_BUF:
+ return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd);
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
if (ublk_queue_ready(ubq)) {
@@ -1872,36 +1972,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
return -EIOCBQUEUED;
}
-static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- struct ublk_queue *ubq, int tag, size_t offset)
-{
- struct request *req;
-
- if (!ublk_need_req_ref(ubq))
- return NULL;
-
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
- if (!req)
- return NULL;
-
- if (!ublk_get_req_ref(ubq, req))
- return NULL;
-
- if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
- goto fail_put;
-
- if (!ublk_rq_has_data(req))
- goto fail_put;
-
- if (offset > blk_rq_bytes(req))
- goto fail_put;
-
- return req;
-fail_put:
- ublk_put_req_ref(ubq, req);
- return NULL;
-}
-
static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
@@ -2527,9 +2597,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
goto out_free_dev_number;
}
- /* We are not ready to support zero copy */
- ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
-
ub->dev_info.nr_hw_queues = min_t(unsigned int,
ub->dev_info.nr_hw_queues, nr_cpu_ids);
ublk_align_max_io_size(ub);
@@ -2860,7 +2927,7 @@ static int ublk_ctrl_get_features(struct io_uring_cmd *cmd)
{
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
void __user *argp = (void __user *)(unsigned long)header->addr;
- u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY;
+ u64 features = UBLK_F_ALL;
if (header->len != UBLK_FEATURES_LEN || !header->addr)
return -EINVAL;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index a8bc98bb69fce..74246c926b55f 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -94,6 +94,10 @@
_IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd)
#define UBLK_U_IO_NEED_GET_DATA \
_IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd)
+#define UBLK_U_IO_REGISTER_IO_BUF \
+ _IOWR('u', 0x23, struct ublksrv_io_cmd)
+#define UBLK_U_IO_UNREGISTER_IO_BUF \
+ _IOWR('u', 0x24, struct ublksrv_io_cmd)
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
--
2.43.5
* [PATCH 5/6] io_uring: add abstraction for buf_table rsrc data
From: Keith Busch @ 2025-02-03 15:45 UTC (permalink / raw)
To: io-uring, linux-block, ming.lei, axboe, asml.silence; +Cc: Keith Busch
From: Keith Busch <[email protected]>
We'll need to add more fields specific to the registered buffers, so
make a layer for it now. No functional change in this patch.
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/io_uring_types.h | 6 +++-
io_uring/fdinfo.c | 8 +++---
io_uring/net.c | 2 +-
io_uring/nop.c | 2 +-
io_uring/register.c | 2 +-
io_uring/rsrc.c | 51 +++++++++++++++++-----------------
io_uring/rw.c | 2 +-
io_uring/uring_cmd.c | 2 +-
8 files changed, 39 insertions(+), 36 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 7e5a5a70c35f2..aa661ebfd6568 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,6 +67,10 @@ struct io_file_table {
unsigned int alloc_hint;
};
+struct io_buf_table {
+ struct io_rsrc_data data;
+};
+
struct io_hash_bucket {
struct hlist_head list;
} ____cacheline_aligned_in_smp;
@@ -290,7 +294,7 @@ struct io_ring_ctx {
struct io_wq_work_list iopoll_list;
struct io_file_table file_table;
- struct io_rsrc_data buf_table;
+ struct io_buf_table buf_table;
struct io_submit_state submit_state;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index f60d0a9d505e2..d389c06cbce10 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -217,12 +217,12 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
seq_puts(m, "\n");
}
}
- seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
- for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+ seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.data.nr);
+ for (i = 0; has_lock && i < ctx->buf_table.data.nr; i++) {
struct io_mapped_ubuf *buf = NULL;
- if (ctx->buf_table.nodes[i])
- buf = ctx->buf_table.nodes[i]->buf;
+ if (ctx->buf_table.data.nodes[i])
+ buf = ctx->buf_table.data.nodes[i]->buf;
if (buf)
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
else
diff --git a/io_uring/net.c b/io_uring/net.c
index 4e9d0f04b902d..4917786456cf8 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1348,7 +1348,7 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
ret = -EFAULT;
io_ring_submit_lock(ctx, issue_flags);
- node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
+ node = io_rsrc_node_lookup(&ctx->buf_table.data, sr->buf_index);
if (node) {
io_req_assign_buf_node(sr->notif, node);
ret = 0;
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 5e5196df650a1..e3ebe5f019076 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -69,7 +69,7 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
ret = -EFAULT;
io_ring_submit_lock(ctx, issue_flags);
- node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer);
+ node = io_rsrc_node_lookup(&ctx->buf_table.data, nop->buffer);
if (node) {
io_req_assign_buf_node(req, node);
ret = 0;
diff --git a/io_uring/register.c b/io_uring/register.c
index 0db181437ae33..e8f00b19e75f6 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -919,7 +919,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
ret = __io_uring_register(ctx, opcode, arg, nr_args);
trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
- ctx->buf_table.nr, ret);
+ ctx->buf_table.data.nr, ret);
mutex_unlock(&ctx->uring_lock);
fput(file);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 8c4c374abcc10..864c2eabf8efd 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -234,17 +234,17 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
__u32 done;
int i, err;
- if (!ctx->buf_table.nr)
+ if (!ctx->buf_table.data.nr)
return -ENXIO;
- if (up->offset + nr_args > ctx->buf_table.nr)
+ if (up->offset + nr_args > ctx->buf_table.data.nr)
return -EINVAL;
for (done = 0; done < nr_args; done++) {
struct io_rsrc_node *node;
u64 tag = 0;
- i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
- node = io_rsrc_node_lookup(&ctx->buf_table, i);
+ i = array_index_nospec(up->offset + done, ctx->buf_table.data.nr);
+ node = io_rsrc_node_lookup(&ctx->buf_table.data, i);
if (node && node->type != IORING_RSRC_BUFFER) {
err = -EBUSY;
break;
@@ -276,8 +276,8 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
}
node->tag = tag;
}
- io_reset_rsrc_node(ctx, &ctx->buf_table, i);
- ctx->buf_table.nodes[i] = node;
+ io_reset_rsrc_node(ctx, &ctx->buf_table.data, i);
+ ctx->buf_table.data.nodes[i] = node;
if (ctx->compat)
user_data += sizeof(struct compat_iovec);
else
@@ -556,9 +556,9 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
- if (!ctx->buf_table.nr)
+ if (!ctx->buf_table.data.nr)
return -ENXIO;
- io_rsrc_data_free(ctx, &ctx->buf_table);
+ io_rsrc_data_free(ctx, &ctx->buf_table.data);
return 0;
}
@@ -585,8 +585,8 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
}
/* check previously registered pages */
- for (i = 0; i < ctx->buf_table.nr; i++) {
- struct io_rsrc_node *node = ctx->buf_table.nodes[i];
+ for (i = 0; i < ctx->buf_table.data.nr; i++) {
+ struct io_rsrc_node *node = ctx->buf_table.data.nodes[i];
struct io_mapped_ubuf *imu;
if (!node)
@@ -812,7 +812,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
- if (ctx->buf_table.nr)
+ if (ctx->buf_table.data.nr)
return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL;
@@ -865,7 +865,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
data.nodes[i] = node;
}
- ctx->buf_table = data;
+ ctx->buf_table.data = data;
if (ret)
io_sqe_buffers_unregister(ctx);
return ret;
@@ -901,7 +901,7 @@ static struct io_rsrc_node *io_buffer_alloc_node(struct io_ring_ctx *ctx,
int io_buffer_register_bvec(struct io_ring_ctx *ctx, const struct request *rq,
unsigned int index)
{
- struct io_rsrc_data *data = &ctx->buf_table;
+ struct io_rsrc_data *data = &ctx->buf_table.data;
u16 nr_bvecs = blk_rq_nr_phys_segments(rq);
struct req_iterator rq_iter;
struct io_rsrc_node *node;
@@ -938,7 +938,7 @@ EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index)
{
- struct io_rsrc_data *data = &ctx->buf_table;
+ struct io_rsrc_data *data = &ctx->buf_table.data;
struct io_rsrc_node *node;
lockdep_assert_held(&ctx->uring_lock);
@@ -1054,10 +1054,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (!arg->nr && (arg->dst_off || arg->src_off))
return -EINVAL;
/* not allowed unless REPLACE is set */
- if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
+ if (ctx->buf_table.data.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
return -EBUSY;
- nbufs = src_ctx->buf_table.nr;
+ nbufs = src_ctx->buf_table.data.nr;
if (!arg->nr)
arg->nr = nbufs;
else if (arg->nr > nbufs)
@@ -1067,13 +1067,13 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
- ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
+ ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
if (ret)
return ret;
/* Fill entries in data from dst that won't overlap with src */
- for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
- struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+ for (i = 0; i < min(arg->dst_off, ctx->buf_table.data.nr); i++) {
+ struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
if (src_node) {
data.nodes[i] = src_node;
@@ -1082,7 +1082,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
}
ret = -ENXIO;
- nbufs = src_ctx->buf_table.nr;
+ nbufs = src_ctx->buf_table.data.nr;
if (!nbufs)
goto out_free;
ret = -EINVAL;
@@ -1102,7 +1102,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
while (nr--) {
struct io_rsrc_node *dst_node, *src_node;
- src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
+ src_node = io_rsrc_node_lookup(&src_ctx->buf_table.data, i);
if (!src_node) {
dst_node = NULL;
} else {
@@ -1124,7 +1124,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
* old and new nodes at this point.
*/
if (arg->flags & IORING_REGISTER_DST_REPLACE)
- io_rsrc_data_free(ctx, &ctx->buf_table);
+ io_sqe_buffers_unregister(ctx);
/*
* ctx->buf_table must be empty now - either the contents are being
@@ -1132,10 +1132,9 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
* copied to a ring that does not have buffers yet (checked at function
* entry).
*/
- WARN_ON_ONCE(ctx->buf_table.nr);
- ctx->buf_table = data;
+ WARN_ON_ONCE(ctx->buf_table.data.nr);
+ ctx->buf_table.data = data;
return 0;
-
out_free:
io_rsrc_data_free(ctx, &data);
return ret;
@@ -1160,7 +1159,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
return -EFAULT;
if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
return -EINVAL;
- if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
+ if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.data.nr)
return -EBUSY;
if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
return -EINVAL;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index d6332d019dd56..f49ae3de94317 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -387,7 +387,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
if (unlikely(ret))
return ret;
- node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
+ node = io_rsrc_node_lookup(&ctx->buf_table.data, req->buf_index);
if (!node)
return -EFAULT;
io_req_assign_buf_node(req, node);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index b7b9baf30d728..5c9f14d700373 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -213,7 +213,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_rsrc_node *node;
u16 index = READ_ONCE(sqe->buf_index);
- node = io_rsrc_node_lookup(&ctx->buf_table, index);
+ node = io_rsrc_node_lookup(&ctx->buf_table.data, index);
if (unlikely(!node))
return -EFAULT;
/*
--
2.43.5
* [PATCH 6/6] io_uring: cache nodes and mapped buffers
From: Keith Busch @ 2025-02-03 15:45 UTC (permalink / raw)
To: io-uring, linux-block, ming.lei, axboe, asml.silence; +Cc: Keith Busch
From: Keith Busch <[email protected]>
Frequent alloc/free cycles on these are pretty costly. Use an io cache
to reuse these buffers more efficiently.
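The policy, in short (a sketch condensing the hunks below): buffer nodes
and small imu allocations come from per-table caches, while imus too big
for the cache element size fall back to kvmalloc exactly as before:

  /* alloc side (io_sqe_buffer_register() / io_buffer_alloc_node()) */
  if (struct_size(imu, bvec, nr_bvecs) > ctx->buf_table.imu_cache.elem_size)
          imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
  else
          imu = io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL, NULL);

  /* free side (io_buffer_unmap()): put back to the cache when it fits */
  if (struct_size(imu, bvec, imu->nr_bvecs) > ctx->buf_table.imu_cache.elem_size ||
      !io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
          kvfree(imu);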
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/io_uring_types.h | 16 ++---
io_uring/filetable.c | 2 +-
io_uring/rsrc.c | 108 ++++++++++++++++++++++++---------
io_uring/rsrc.h | 2 +-
4 files changed, 92 insertions(+), 36 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index aa661ebfd6568..c0e0c1f92e5b1 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,8 +67,17 @@ struct io_file_table {
unsigned int alloc_hint;
};
+struct io_alloc_cache {
+ void **entries;
+ unsigned int nr_cached;
+ unsigned int max_cached;
+ size_t elem_size;
+};
+
struct io_buf_table {
struct io_rsrc_data data;
+ struct io_alloc_cache node_cache;
+ struct io_alloc_cache imu_cache;
};
struct io_hash_bucket {
@@ -222,13 +231,6 @@ struct io_submit_state {
struct blk_plug plug;
};
-struct io_alloc_cache {
- void **entries;
- unsigned int nr_cached;
- unsigned int max_cached;
- size_t elem_size;
-};
-
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index dd8eeec97acf6..a21660e3145ab 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
if (slot_index >= ctx->file_table.data.nr)
return -EINVAL;
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node)
return -ENOMEM;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 864c2eabf8efd..5434b0d992d62 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -117,23 +117,39 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
- kvfree(imu);
+ if (struct_size(imu, bvec, imu->nr_bvecs) >
+ ctx->buf_table.imu_cache.elem_size ||
+ !io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
+ kvfree(imu);
}
}
-struct io_rsrc_node *io_rsrc_node_alloc(int type)
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
struct io_rsrc_node *node;
- node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (type == IORING_RSRC_FILE)
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ else
+ node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL, NULL);
if (node) {
node->type = type;
node->refs = 1;
+ node->tag = 0;
+ node->file_ptr = 0;
}
return node;
}
-__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
+static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
+{
+ kvfree(data->nodes);
+ data->nodes = NULL;
+ data->nr = 0;
+}
+
+__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
+ struct io_rsrc_data *data)
{
if (!data->nr)
return;
@@ -141,9 +157,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
if (data->nodes[data->nr])
io_put_rsrc_node(ctx, data->nodes[data->nr]);
}
- kvfree(data->nodes);
- data->nodes = NULL;
- data->nr = 0;
+ __io_rsrc_data_free(data);
}
__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
@@ -157,6 +171,31 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
return -ENOMEM;
}
+static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
+{
+ int ret;
+
+ ret = io_rsrc_data_alloc(&table->data, nr);
+ if (ret)
+ return ret;
+
+ ret = io_alloc_cache_init(&table->node_cache, nr,
+ sizeof(struct io_rsrc_node));
+ if (ret)
+ goto out_1;
+
+ ret = io_alloc_cache_init(&table->imu_cache, nr, 512);
+ if (ret)
+ goto out_2;
+
+ return 0;
+out_2:
+ io_alloc_cache_free(&table->node_cache, kfree);
+out_1:
+ __io_rsrc_data_free(&table->data);
+ return ret;
+}
+
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned nr_args)
@@ -206,7 +245,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node) {
err = -ENOMEM;
fput(file);
@@ -466,6 +505,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
case IORING_RSRC_KBUF:
if (node->buf)
io_buffer_unmap(ctx, node);
+ if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
+ return;
break;
default:
WARN_ON_ONCE(1);
@@ -534,7 +575,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
goto fail;
}
ret = -ENOMEM;
- node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
if (!node) {
fput(file);
goto fail;
@@ -554,11 +595,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
+static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
+ struct io_buf_table *table)
+{
+ io_rsrc_data_free(ctx, &table->data);
+ io_alloc_cache_free(&table->node_cache, kfree);
+ io_alloc_cache_free(&table->imu_cache, kfree);
+}
+
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
if (!ctx->buf_table.data.nr)
return -ENXIO;
- io_rsrc_data_free(ctx, &ctx->buf_table.data);
+ io_rsrc_buffer_free(ctx, &ctx->buf_table);
return 0;
}
@@ -739,7 +788,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
if (!iov->iov_base)
return NULL;
- node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!node)
return ERR_PTR(-ENOMEM);
node->buf = NULL;
@@ -759,7 +808,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
}
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ if (struct_size(imu, bvec, nr_pages) > ctx->buf_table.imu_cache.elem_size)
+ imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ else
+ imu = io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL, NULL);
if (!imu)
goto done;
@@ -805,9 +857,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags)
{
struct page *last_hpage = NULL;
- struct io_rsrc_data data;
struct iovec fast_iov, *iov = &fast_iov;
const struct iovec __user *uvec;
+ struct io_buf_table table;
int i, ret;
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@@ -816,13 +868,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL;
- ret = io_rsrc_data_alloc(&data, nr_args);
+ ret = io_rsrc_buffer_alloc(&table, nr_args);
if (ret)
return ret;
if (!arg)
memset(iov, 0, sizeof(*iov));
+ ctx->buf_table = table;
for (i = 0; i < nr_args; i++) {
struct io_rsrc_node *node;
u64 tag = 0;
@@ -862,10 +915,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
node->tag = tag;
}
- data.nodes[i] = node;
+ table.data.nodes[i] = node;
}
-
- ctx->buf_table.data = data;
if (ret)
io_sqe_buffers_unregister(ctx);
return ret;
@@ -878,11 +929,14 @@ static struct io_rsrc_node *io_buffer_alloc_node(struct io_ring_ctx *ctx,
struct io_mapped_ubuf *imu;
struct io_rsrc_node *node;
- node = io_rsrc_node_alloc(IORING_RSRC_KBUF);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_KBUF);
if (!node)
return NULL;
- imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+ if (struct_size(imu, bvec, nr_bvecs) > ctx->buf_table.imu_cache.elem_size)
+ imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+ else
+ imu = io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL, NULL);
if (!imu) {
io_put_rsrc_node(ctx, node);
return NULL;
@@ -1036,7 +1090,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
struct io_uring_clone_buffers *arg)
{
- struct io_rsrc_data data;
+ struct io_buf_table table;
int i, ret, off, nr;
unsigned int nbufs;
@@ -1067,7 +1121,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
- ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
+ ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
if (ret)
return ret;
@@ -1076,7 +1130,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
if (src_node) {
- data.nodes[i] = src_node;
+ table.data.nodes[i] = src_node;
src_node->refs++;
}
}
@@ -1106,7 +1160,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (!src_node) {
dst_node = NULL;
} else {
- dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+ dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
if (!dst_node) {
ret = -ENOMEM;
goto out_free;
@@ -1115,12 +1169,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
refcount_inc(&src_node->buf->refs);
dst_node->buf = src_node->buf;
}
- data.nodes[off++] = dst_node;
+ table.data.nodes[off++] = dst_node;
i++;
}
/*
- * If asked for replace, put the old table. data->nodes[] holds both
+ * If asked for replace, put the old table. table.data.nodes[] holds both
* old and new nodes at this point.
*/
if (arg->flags & IORING_REGISTER_DST_REPLACE)
@@ -1133,10 +1187,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
* entry).
*/
WARN_ON_ONCE(ctx->buf_table.data.nr);
- ctx->buf_table.data = data;
+ ctx->buf_table = table;
return 0;
out_free:
- io_rsrc_data_free(ctx, &data);
+ io_rsrc_buffer_free(ctx, &table);
return ret;
}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index d1d90d9cd2b43..759ac373b0dc6 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -46,7 +46,7 @@ struct io_imu_folio_data {
unsigned int nr_folios;
};
-struct io_rsrc_node *io_rsrc_node_alloc(int type);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
--
2.43.5