From: Ming Lei <[email protected]>
To: Jens Axboe <[email protected]>,
[email protected], Pavel Begunkov <[email protected]>
Cc: [email protected],
Uday Shankar <[email protected]>,
Akilesh Kailash <[email protected]>,
Ming Lei <[email protected]>
Subject: [PATCH V10 12/12] ublk: support leasing io buffer to io_uring
Date: Thu, 7 Nov 2024 19:01:45 +0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
Suopport to lease block IO buffer for userpace to run io_uring operations(FS,
network IO), then ublk zero copy can be supported.
userspace code:
git clone https://github.com/ublk-org/ublksrv.git -b uring_group
And both loop and nbd zero copy(io_uring send and send zc) are covered.
Performance improvement is quite obvious in big block size test, such as
'loop --buffered_io' perf is doubled in 64KB block test("loop/007 vs
loop/009").
Signed-off-by: Ming Lei <[email protected]>
---
drivers/block/ublk_drv.c | 168 ++++++++++++++++++++++++++++++++--
include/uapi/linux/ublk_cmd.h | 11 ++-
2 files changed, 169 insertions(+), 10 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6ba2c1dd1d87..d90f4cac8154 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -51,6 +51,8 @@
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+#define UBLK_IO_PROVIDE_IO_BUF _IOC_NR(UBLK_U_IO_PROVIDE_IO_BUF)
+
/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
| UBLK_F_URING_CMD_COMP_IN_TASK \
@@ -67,10 +69,18 @@
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
+struct uring_kbuf {
+ struct io_rsrc_node node;
+ struct io_mapped_buf buf;
+};
+
struct ublk_rq_data {
struct llist_node node;
struct kref ref;
+
+ bool allocated_bvec;
+ struct uring_kbuf buf[0];
};
struct ublk_uring_cmd_pdu {
@@ -189,11 +199,15 @@ struct ublk_params_header {
__u32 types;
};
+static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
+ struct ublk_queue *ubq, int tag, size_t offset);
static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
+static void ublk_io_buf_giveback_cb(struct io_rsrc_node *node);
+
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_USER_COPY;
@@ -588,6 +602,11 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
return ublk_support_user_copy(ubq);
}
+static inline bool ublk_support_zc(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
+}
+
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
struct request *req)
{
@@ -851,6 +870,74 @@ static size_t ublk_copy_user_pages(const struct request *req,
return done;
}
+/*
+ * The built command buffer is immutable, so it is fine to feed it to
+ * concurrent io_uring provide buf commands
+ */
+static int ublk_init_zero_copy_buffer(struct request *req)
+{
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ struct io_rsrc_node *node = &data->buf->node;
+ struct io_mapped_buf *imu = &data->buf->buf;
+ struct req_iterator rq_iter;
+ unsigned int nr_bvecs = 0;
+ struct bio_vec *bvec;
+ unsigned int offset;
+ struct bio_vec bv;
+
+ if (!ublk_rq_has_data(req))
+ goto exit;
+
+ rq_for_each_bvec(bv, req, rq_iter)
+ nr_bvecs++;
+
+ if (!nr_bvecs)
+ goto exit;
+
+ if (req->bio != req->biotail) {
+ int idx = 0;
+
+ bvec = kvmalloc_array(nr_bvecs, sizeof(struct bio_vec),
+ GFP_NOIO);
+ if (!bvec)
+ return -ENOMEM;
+
+ offset = 0;
+ rq_for_each_bvec(bv, req, rq_iter)
+ bvec[idx++] = bv;
+ data->allocated_bvec = true;
+ } else {
+ struct bio *bio = req->bio;
+
+ offset = bio->bi_iter.bi_bvec_done;
+ bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ }
+ imu->kbuf = 1;
+ imu->pbvec = bvec;
+ imu->nr_bvecs = nr_bvecs;
+ imu->offset = offset;
+ imu->len = blk_rq_bytes(req);
+ imu->dir = req_op(req) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE;
+ imu->kbuf_ack = ublk_io_buf_giveback_cb;
+ node->buf = imu;
+
+ return 0;
+exit:
+ imu->pbvec = NULL;
+ return 0;
+}
+
+static void ublk_deinit_zero_copy_buffer(struct request *req)
+{
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ struct io_mapped_buf *imu = &data->buf->buf;
+
+ if (data->allocated_bvec) {
+ kvfree(imu->pbvec);
+ data->allocated_bvec = false;
+ }
+}
+
static inline bool ublk_need_map_req(const struct request *req)
{
return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
@@ -862,13 +949,25 @@ static inline bool ublk_need_unmap_req(const struct request *req)
(req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}
-static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
+static int ublk_map_io(const struct ublk_queue *ubq, struct request *req,
struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
- if (ublk_support_user_copy(ubq))
+ if (ublk_support_user_copy(ubq)) {
+ if (ublk_support_zc(ubq)) {
+ int ret = ublk_init_zero_copy_buffer(req);
+
+ /*
+ * The only failure is -ENOMEM for allocating providing
+ * buffer command, return zero so that we can requeue
+ * this req.
+ */
+ if (unlikely(ret))
+ return 0;
+ }
return rq_bytes;
+ }
/*
* no zero copy, we delay copy WRITE request data into ublksrv
@@ -886,13 +985,16 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
}
static int ublk_unmap_io(const struct ublk_queue *ubq,
- const struct request *req,
+ struct request *req,
struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
- if (ublk_support_user_copy(ubq))
+ if (ublk_support_user_copy(ubq)) {
+ if (ublk_support_zc(ubq))
+ ublk_deinit_zero_copy_buffer(req);
return rq_bytes;
+ }
if (ublk_need_unmap_req(req)) {
struct iov_iter iter;
@@ -1038,6 +1140,7 @@ static inline void __ublk_complete_rq(struct request *req)
return;
exit:
+ ublk_deinit_zero_copy_buffer(req);
blk_mq_end_request(req, res);
}
@@ -1680,6 +1783,46 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
+static void ublk_io_buf_giveback_cb(struct io_rsrc_node *node)
+{
+ struct uring_kbuf *buf = (struct uring_kbuf *)node;
+ struct ublk_rq_data *data = container_of(buf, struct ublk_rq_data, buf[0]);
+ struct request *req = blk_mq_rq_from_pdu(data);
+ struct ublk_queue *ubq = req->mq_hctx->driver_data;
+
+ ublk_put_req_ref(ubq, req);
+}
+
+static int ublk_provide_io_buf(struct io_uring_cmd *cmd,
+ struct ublk_queue *ubq, int tag)
+{
+ struct ublk_device *ub = cmd->file->private_data;
+ struct ublk_rq_data *data;
+ struct request *req;
+
+ if (!ub)
+ return -EPERM;
+
+ req = __ublk_check_and_get_req(ub, ubq, tag, 0);
+ if (!req)
+ return -EINVAL;
+
+ pr_devel("%s: qid %d tag %u request bytes %u\n",
+ __func__, tag, ubq->q_id, blk_rq_bytes(req));
+
+ data = blk_mq_rq_to_pdu(req);
+
+ /*
+ * io_uring guarantees that the callback will be called after
+ * the provided buffer is consumed, and it is automatic removal
+ * before this uring command is freed.
+ *
+ * This request won't be completed unless the callback is called,
+ * so ublk module won't be unloaded too.
+ */
+ return io_uring_cmd_lease_kbuf(cmd, &data->buf->node);
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1731,6 +1874,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
ret = -EINVAL;
switch (_IOC_NR(cmd_op)) {
+ case UBLK_IO_PROVIDE_IO_BUF:
+ if (unlikely(!ublk_support_zc(ubq)))
+ goto out;
+ return ublk_provide_io_buf(cmd, ubq, tag);
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
if (ublk_queue_ready(ubq)) {
@@ -2149,11 +2296,14 @@ static void ublk_align_max_io_size(struct ublk_device *ub)
static int ublk_add_tag_set(struct ublk_device *ub)
{
+ int zc = !!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY);
+ struct ublk_rq_data *data;
+
ub->tag_set.ops = &ublk_mq_ops;
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
ub->tag_set.numa_node = NUMA_NO_NODE;
- ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
+ ub->tag_set.cmd_size = struct_size(data, buf, zc);
ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ub->tag_set.driver_data = ub;
return blk_mq_alloc_tag_set(&ub->tag_set);
@@ -2458,8 +2608,12 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
goto out_free_dev_number;
}
- /* We are not ready to support zero copy */
- ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
+ /* zero copy depends on user copy */
+ if ((ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) &&
+ !ublk_dev_is_user_copy(ub)) {
+ ret = -EINVAL;
+ goto out_free_dev_number;
+ }
ub->dev_info.nr_hw_queues = min_t(unsigned int,
ub->dev_info.nr_hw_queues, nr_cpu_ids);
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 12873639ea96..04d73b349709 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -94,6 +94,8 @@
_IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd)
#define UBLK_U_IO_NEED_GET_DATA \
_IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd)
+#define UBLK_U_IO_PROVIDE_IO_BUF \
+ _IOWR('u', 0x23, struct ublksrv_io_cmd)
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
@@ -127,9 +129,12 @@
#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
/*
- * zero copy requires 4k block size, and can remap ublk driver's io
- * request into ublksrv's vm space
- */
+ * io_uring provide kbuf command based zero copy
+ *
+ * Not available for UBLK_F_UNPRIVILEGED_DEV, because we rely on ublk
+ * server to fill up request buffer for READ IO, and ublk server can't
+ * be trusted in case of UBLK_F_UNPRIVILEGED_DEV.
+*/
#define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0)
/*
--
2.47.0
next prev parent reply other threads:[~2024-11-07 11:02 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-07 11:01 [PATCH V10 0/12] io_uring: support group buffer & ublk zc Ming Lei
2024-11-07 11:01 ` [PATCH V10 01/12] io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers Ming Lei
2024-11-07 11:01 ` [PATCH V10 02/12] io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node' Ming Lei
2024-11-07 11:01 ` [PATCH V10 03/12] io_uring/rsrc: add & apply io_req_assign_buf_node() Ming Lei
2024-11-07 11:01 ` [PATCH V10 04/12] io_uring/rsrc: prepare for supporting external 'io_rsrc_node' Ming Lei
2024-11-07 11:01 ` [PATCH V10 05/12] io_uring: rename io_mapped_ubuf as io_mapped_buf Ming Lei
2024-11-07 11:01 ` [PATCH V10 06/12] io_uring: rename io_mapped_buf->ubuf as io_mapped_buf->addr Ming Lei
2024-11-07 11:01 ` [PATCH V10 07/12] io_uring: shrink io_mapped_buf Ming Lei
2024-11-07 11:01 ` [PATCH V10 08/12] io_uring: reuse io_mapped_buf for kernel buffer Ming Lei
2024-11-07 11:01 ` [PATCH V10 09/12] io_uring: add callback to 'io_mapped_buffer' for giving back " Ming Lei
2024-11-07 11:01 ` [PATCH V10 10/12] io_uring: support leased group buffer with REQ_F_GROUP_BUF Ming Lei
2024-11-07 11:01 ` [PATCH V10 11/12] io_uring/uring_cmd: support leasing device kernel buffer to io_uring Ming Lei
2024-11-07 18:27 ` kernel test robot
2024-11-07 19:30 ` kernel test robot
2024-11-08 0:59 ` Ming Lei
2024-11-07 11:01 ` Ming Lei [this message]
2024-11-07 22:25 ` (subset) [PATCH V10 0/12] io_uring: support group buffer & ublk zc Jens Axboe
2024-11-07 22:25 ` Jens Axboe
2024-11-12 0:53 ` Ming Lei
2024-11-13 13:43 ` Pavel Begunkov
2024-11-13 14:56 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox