From: Ming Lei <[email protected]>
To: Jens Axboe <[email protected]>, [email protected]
Cc: [email protected], Miklos Szeredi <[email protected]>,
ZiyangZhang <[email protected]>,
Xiaoguang Wang <[email protected]>,
Bernd Schubert <[email protected]>,
Ming Lei <[email protected]>
Subject: [RFC PATCH 12/12] block: ublk_drv: apply io_uring FUSED_CMD for supporting zero copy
Date: Wed, 1 Mar 2023 22:06:11 +0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
Apply io_uring fused command for supporting zero copy:
1) init the fused cmd buffer(io_mapped_buf) in ublk_map_io(),
and deinit it in ublk_unmap_io(), and this buffer is immutable,
so it is just fine to retrieve it from concurrent fused command.
1) add sub-command opcode of UBLK_IO_FUSED_SUBMIT_IO for retrieving
this fused cmd(zero copy) buffer
2) call io_fused_cmd_provide_kbuf() to provide buffer to slave
request; meantime setup complete callback via this API, once
slave request is completed, the complete callback is called
for freeing the buffer and completing the uring fused command
Todo: don't complete ublk block request until all in-flight fused
commands aiming this request are completed; this change requires
to clean up current ublk driver a bit, so delay this work in future
post, and it won't affect reviewing on this whole approach.
Signed-off-by: Ming Lei <[email protected]>
---
drivers/block/ublk_drv.c | 167 ++++++++++++++++++++++++++++++++--
include/uapi/linux/ublk_cmd.h | 1 +
2 files changed, 160 insertions(+), 8 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index b9e38ebabca7..56a362798aa7 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -62,6 +62,8 @@
struct ublk_rq_data {
struct llist_node node;
struct callback_head work;
+ bool allocated_bvec;
+ struct io_mapped_buf buf[0];
};
struct ublk_uring_cmd_pdu {
@@ -525,10 +527,87 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data,
return done;
}
-static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
+/*
+ * The built command buffer is immutable, so it is fine to feed it to
+ * concurrent io_uring fused commands
+ */
+static int ublk_init_zero_copy_buffer(struct request *rq)
+{
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+ struct io_mapped_buf *imu = data->buf;
+ struct req_iterator rq_iter;
+ unsigned int nr_bvecs = 0;
+ struct bio_vec *bvec;
+ unsigned int offset;
+ struct bio_vec bv;
+
+ if (!ublk_rq_has_data(rq))
+ goto exit;
+
+ rq_for_each_bvec(bv, rq, rq_iter)
+ nr_bvecs++;
+
+ if (!nr_bvecs)
+ goto exit;
+
+ if (rq->bio != rq->biotail) {
+ int idx = 0;
+
+ bvec = kvmalloc_array(sizeof(struct bio_vec), nr_bvecs,
+ GFP_NOIO);
+ if (!bvec)
+ return -ENOMEM;
+
+ offset = 0;
+ rq_for_each_bvec(bv, rq, rq_iter)
+ bvec[idx++] = bv;
+ data->allocated_bvec = true;
+ } else {
+ struct bio *bio = rq->bio;
+
+ offset = bio->bi_iter.bi_bvec_done;
+ bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ }
+ imu->bvec = bvec;
+ imu->nr_bvecs = nr_bvecs;
+ imu->offset = offset;
+ imu->buf = 0;
+ imu->buf_end = blk_rq_bytes(rq);
+
+ return 0;
+exit:
+ imu->bvec = NULL;
+ return 0;
+}
+
+static void ublk_deinit_zero_copy_buffer(struct request *rq)
+{
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+ struct io_mapped_buf *imu = data->buf;
+
+ if (data->allocated_bvec) {
+ kvfree(imu->bvec);
+ data->allocated_bvec = false;
+ }
+}
+
+static int ublk_map_io(const struct ublk_queue *ubq, struct request *req,
struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+
+ if (ubq->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+ int ret = ublk_init_zero_copy_buffer(req);
+
+ /*
+ * The only failure is -ENOMEM for allocating fused cmd
+ * buffer, return zero so that we can requeue this req.
+ */
+ if (unlikely(ret))
+ return 0;
+ return rq_bytes;
+ }
+
/*
* no zero copy, we delay copy WRITE request data into ublksrv
* context and the big benefit is that pinning pages in current
@@ -553,11 +632,17 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
}
static int ublk_unmap_io(const struct ublk_queue *ubq,
- const struct request *req,
+ struct request *req,
struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+ if (ubq->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+ ublk_deinit_zero_copy_buffer(req);
+
+ return rq_bytes;
+ }
+
if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
struct ublk_map_data data = {
.ubq = ubq,
@@ -693,6 +778,7 @@ static void ublk_complete_rq(struct request *req)
return;
exit:
+ ublk_deinit_zero_copy_buffer(req);
blk_mq_end_request(req, res);
}
@@ -1259,6 +1345,66 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
ublk_queue_cmd(ubq, req);
}
+static inline bool ublk_check_fused_buf_dir(const struct request *req,
+ unsigned int flags)
+{
+ flags &= IO_URING_F_FUSED;
+
+ if (req_op(req) == REQ_OP_READ && flags == IO_URING_F_FUSED_WRITE)
+ return true;
+
+ if (req_op(req) == REQ_OP_WRITE && flags == IO_URING_F_FUSED_READ)
+ return true;
+
+ return false;
+}
+
+static void ublk_fused_cmd_done_cb(struct io_uring_cmd *cmd)
+{
+ io_uring_cmd_done(cmd, cmd->fused.data.slave_res, 0);
+}
+
+static int ublk_handle_fused_cmd(struct io_uring_cmd *cmd,
+ struct ublk_queue *ubq, int tag, unsigned int issue_flags)
+{
+ struct ublk_device *ub = cmd->file->private_data;
+ struct ublk_rq_data *data;
+ struct request *req;
+
+ if (!ub)
+ return -EPERM;
+
+ if (!(issue_flags & IO_URING_F_FUSED))
+ goto exit;
+
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ goto exit;
+
+ if (!(ubq->flags & UBLK_F_SUPPORT_ZERO_COPY))
+ goto exit;
+
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (!req || !blk_mq_request_started(req))
+ goto exit;
+
+ pr_devel("%s: qid %d tag %u request bytes %u, issue flags %x\n",
+ __func__, tag, ubq->q_id, blk_rq_bytes(req),
+ issue_flags);
+
+ if (!ublk_check_fused_buf_dir(req, issue_flags))
+ goto exit;
+
+ if (!ublk_rq_has_data(req))
+ goto exit;
+
+ data = blk_mq_rq_to_pdu(req);
+ io_fused_cmd_provide_kbuf(cmd, !(issue_flags & IO_URING_F_UNLOCKED),
+ data->buf, ublk_fused_cmd_done_cb);
+ return -EIOCBQUEUED;
+exit:
+ return -EINVAL;
+}
+
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
@@ -1277,7 +1423,8 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
if (!(issue_flags & IO_URING_F_SQE128))
goto out;
- if (issue_flags & IO_URING_F_FUSED)
+ if ((issue_flags & IO_URING_F_FUSED) &&
+ cmd_op != UBLK_IO_FUSED_SUBMIT_IO)
return -EOPNOTSUPP;
if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
@@ -1287,7 +1434,8 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
if (!ubq || ub_cmd->q_id != ubq->q_id)
goto out;
- if (ubq->ubq_daemon && ubq->ubq_daemon != current)
+ if ((ubq->ubq_daemon && ubq->ubq_daemon != current) &&
+ (cmd_op != UBLK_IO_FUSED_SUBMIT_IO))
goto out;
if (tag >= ubq->q_depth)
@@ -1310,6 +1458,9 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
goto out;
switch (cmd_op) {
+ case UBLK_IO_FUSED_SUBMIT_IO:
+ return ublk_handle_fused_cmd(cmd, ubq, tag, issue_flags);
+
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
if (ublk_queue_ready(ubq)) {
@@ -1533,11 +1684,14 @@ static void ublk_align_max_io_size(struct ublk_device *ub)
static int ublk_add_tag_set(struct ublk_device *ub)
{
+ int zc = !!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY);
+ struct ublk_rq_data *data;
+
ub->tag_set.ops = &ublk_mq_ops;
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
ub->tag_set.numa_node = NUMA_NO_NODE;
- ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
+ ub->tag_set.cmd_size = struct_size(data, buf, zc);
ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ub->tag_set.driver_data = ub;
return blk_mq_alloc_tag_set(&ub->tag_set);
@@ -1756,9 +1910,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
- /* We are not ready to support zero copy */
- ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
-
ub->dev_info.nr_hw_queues = min_t(unsigned int,
ub->dev_info.nr_hw_queues, nr_cpu_ids);
ublk_align_max_io_size(ub);
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index f6238ccc7800..027e60e49cc8 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -44,6 +44,7 @@
#define UBLK_IO_FETCH_REQ 0x20
#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21
#define UBLK_IO_NEED_GET_DATA 0x22
+#define UBLK_IO_FUSED_SUBMIT_IO 0x23
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
--
2.31.1
next prev parent reply other threads:[~2023-03-01 14:09 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-03-01 14:05 [RFC PATCH 00/12] io_uring: add IORING_OP_FUSED_CMD Ming Lei
2023-03-01 14:06 ` [RFC PATCH 01/12] io_uring: increase io_kiocb->flags into 64bit Ming Lei
2023-03-01 14:06 ` [RFC PATCH 02/12] io_uring: define io_mapped_ubuf->acct_pages as unsigned integer Ming Lei
2023-03-01 14:06 ` [RFC PATCH 03/12] io_uring: extend io_mapped_ubuf to cover external bvec table Ming Lei
2023-03-01 14:06 ` [RFC PATCH 04/12] io_uring: rename io_mapped_ubuf as io_mapped_buf Ming Lei
2023-03-01 14:06 ` [RFC PATCH 05/12] io_uring: export 'struct io_mapped_buf' for fused cmd buffer Ming Lei
2023-03-01 14:06 ` [RFC PATCH 06/12] io_uring: add IO_URING_F_FUSED and prepare for supporting OP_FUSED_CMD Ming Lei
2023-03-01 14:06 ` [RFC PATCH 07/12] io_uring: add IORING_OP_FUSED_CMD Ming Lei
2023-03-01 14:06 ` [RFC PATCH 08/12] io_uring: support OP_READ/OP_WRITE for fused slave request Ming Lei
2023-03-01 14:06 ` [RFC PATCH 09/12] io_uring: support OP_SEND_ZC/OP_RECV " Ming Lei
2023-03-01 14:06 ` [RFC PATCH 10/12] block: ublk_drv: mark device as LIVE before adding disk Ming Lei
2023-03-03 2:25 ` Ziyang Zhang
2023-03-01 14:06 ` [RFC PATCH 11/12] block: ublk_drv: add common exit handling Ming Lei
2023-03-03 2:15 ` Ziyang Zhang
2023-03-01 14:06 ` Ming Lei [this message]
2023-03-03 2:52 ` [RFC PATCH 00/12] io_uring: add IORING_OP_FUSED_CMD Ziyang Zhang
2023-03-03 3:01 ` Ming Lei
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox