public inbox for [email protected]
 help / color / mirror / Atom feed
From: Keith Busch <[email protected]>
To: <[email protected]>, <[email protected]>,
	<[email protected]>, <[email protected]>,
	<[email protected]>
Cc: <[email protected]>, <[email protected]>,
	<[email protected]>, Keith Busch <[email protected]>
Subject: [PATCHv6 0/6] ublk zero copy support
Date: Wed, 26 Feb 2025 10:09:51 -0800	[thread overview]
Message-ID: <[email protected]> (raw)

From: Keith Busch <[email protected]>

Changes from v5:

  Merged up to latest block for-next tree

  Fixed up the io_uring read/write fixed prep to not set do_import, and
  actually use the issue_flags when importing the buffer node (Pavel,
  Caleb).

  Used unambigious names for the read/write permissions of registered
  kernel vectors, defined them using their symbolic names instead of
  literals, and added a BUILD_BUG_ON to ensure the flags fits in the
  type (Ming, Pavel).

  Limit the io cache size to 64 elements (Pavel).

  Enforce unpriveledged ublk dev can't use zero copy (Ming).

  Various cleanups.

  Added reviews

Keith Busch (5):
  io_uring/rw: move fixed buffer import to issue path
  io_uring: add support for kernel registered bvecs
  ublk: zc register/unregister bvec
  io_uring: add abstraction for buf_table rsrc data
  io_uring: cache nodes and mapped buffers

Xinyu Zhang (1):
  nvme: map uring_cmd data even if address is 0

 drivers/block/ublk_drv.c       | 119 +++++++++-----
 drivers/nvme/host/ioctl.c      |   2 +-
 include/linux/io_uring/cmd.h   |   7 +
 include/linux/io_uring_types.h |  24 +--
 include/uapi/linux/ublk_cmd.h  |   4 +
 io_uring/fdinfo.c              |   8 +-
 io_uring/filetable.c           |   2 +-
 io_uring/io_uring.c            |   3 +
 io_uring/nop.c                 |   2 +-
 io_uring/opdef.c               |   4 +-
 io_uring/register.c            |   2 +-
 io_uring/rsrc.c                | 280 ++++++++++++++++++++++++++-------
 io_uring/rsrc.h                |  10 +-
 io_uring/rw.c                  |  39 +++--
 io_uring/rw.h                  |   2 +
 15 files changed, 389 insertions(+), 119 deletions(-)

-- 
2.43.5

ublksrv reference implmementation using links:

diff --git a/include/ublk_cmd.h b/include/ublk_cmd.h
index 0150003..07439be 100644
--- a/include/ublk_cmd.h
+++ b/include/ublk_cmd.h
@@ -94,6 +94,10 @@
 	_IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd)
 #define	UBLK_U_IO_NEED_GET_DATA		\
 	_IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd)
+#define UBLK_U_IO_REGISTER_IO_BUF	\
+	_IOWR('u', 0x23, struct ublksrv_io_cmd)
+#define UBLK_U_IO_UNREGISTER_IO_BUF	\
+	_IOWR('u', 0x24, struct ublksrv_io_cmd)
 
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
diff --git a/include/ublksrv_tgt.h b/include/ublksrv_tgt.h
index 1deee2b..c331963 100644
--- a/include/ublksrv_tgt.h
+++ b/include/ublksrv_tgt.h
@@ -99,6 +99,7 @@ struct ublk_io_tgt {
 	co_handle_type co;
 	const struct io_uring_cqe *tgt_io_cqe;
 	int queued_tgt_io;	/* obsolete */
+	bool needs_unregister;
 };
 
 static inline struct ublk_io_tgt *__ublk_get_io_tgt_data(const struct ublk_io_data *io)
diff --git a/lib/ublksrv.c b/lib/ublksrv.c
index 16a9e13..7205247 100644
--- a/lib/ublksrv.c
+++ b/lib/ublksrv.c
@@ -619,6 +619,15 @@ skip_alloc_buf:
 		goto fail;
 	}
 
+	if (ctrl_dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
+		if (ret) {
+			ublk_err("ublk dev %d queue %d register spare buffers failed %d",
+					q->dev->ctrl_dev->dev_info.dev_id, q->q_id, ret);
+			goto fail;
+		}
+	}
+
 	io_uring_register_ring_fd(&q->ring);
 
 	/*
diff --git a/tgt_loop.cpp b/tgt_loop.cpp
index 0f16676..91f8c81 100644
--- a/tgt_loop.cpp
+++ b/tgt_loop.cpp
@@ -246,12 +246,70 @@ static inline int loop_fallocate_mode(const struct ublksrv_io_desc *iod)
        return mode;
 }
 
+static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe,
+		int dev_fd, int tag, int q_id, __u64 index)
+{
+	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+
+	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
+	sqe->opcode		= IORING_OP_URING_CMD;
+	sqe->flags		|= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE;
+	sqe->cmd_op		= UBLK_U_IO_REGISTER_IO_BUF;
+
+	cmd->tag		= tag;
+	cmd->addr		= index;
+	cmd->q_id		= q_id;
+}
+
+static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe,
+		int dev_fd, int tag, int q_id, __u64 index)
+{
+	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+
+	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
+	sqe->opcode             = IORING_OP_URING_CMD;
+	sqe->flags              |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE;
+	sqe->cmd_op             = UBLK_U_IO_UNREGISTER_IO_BUF;
+
+	cmd->tag                = tag;
+	cmd->addr               = index;
+	cmd->q_id               = q_id;
+}
+
+static void loop_unregister(const struct ublksrv_queue *q, int tag)
+{
+	struct io_uring_sqe *sqe;
+
+	ublk_get_sqe_pair(q->ring_ptr, &sqe, NULL);
+	io_uring_prep_buf_unregister(sqe, 0, tag, q->q_id, tag);
+}
+
 static void loop_queue_tgt_read(const struct ublksrv_queue *q,
-		const struct ublksrv_io_desc *iod, int tag)
+		const struct ublk_io_data *data, int tag)
 {
+	struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
+	const struct ublksrv_io_desc *iod = data->iod;
+	const struct ublksrv_ctrl_dev_info *info =
+		ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev));
 	unsigned ublk_op = ublksrv_get_op(iod);
 
-	if (user_copy) {
+	if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		struct io_uring_sqe *reg;
+		struct io_uring_sqe *read;
+
+		ublk_get_sqe_pair(q->ring_ptr, &reg, &read);
+
+		io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag);
+
+		io_uring_prep_read_fixed(read, 1 /*fds[1]*/,
+			0,
+			iod->nr_sectors << 9,
+			iod->start_sector << 9,
+			tag);
+		io_uring_sqe_set_flags(read, IOSQE_FIXED_FILE);
+		read->user_data = build_user_data(tag, ublk_op, 0, 1);
+		io->needs_unregister = true;
+	} else if (user_copy) {
 		struct io_uring_sqe *sqe, *sqe2;
 		__u64 pos = ublk_pos(q->q_id, tag, 0);
 		void *buf = ublksrv_queue_get_io_buf(q, tag);
@@ -284,11 +342,31 @@ static void loop_queue_tgt_read(const struct ublksrv_queue *q,
 }
 
 static void loop_queue_tgt_write(const struct ublksrv_queue *q,
-		const struct ublksrv_io_desc *iod, int tag)
+		const struct ublk_io_data *data, int tag)
 {
+	const struct ublksrv_io_desc *iod = data->iod;
+	const struct ublksrv_ctrl_dev_info *info =
+		ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev));
 	unsigned ublk_op = ublksrv_get_op(iod);
 
-	if (user_copy) {
+	if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
+		struct io_uring_sqe *reg;
+		struct io_uring_sqe *write;
+
+		ublk_get_sqe_pair(q->ring_ptr, &reg, &write);
+		io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag);
+
+		io_uring_prep_write_fixed(write, 1 /*fds[1]*/,
+			0,
+			iod->nr_sectors << 9,
+			iod->start_sector << 9,
+			tag);
+		io_uring_sqe_set_flags(write, IOSQE_FIXED_FILE);
+		write->user_data = build_user_data(tag, ublk_op, 0, 1);
+
+		io->needs_unregister = true;
+	} else if (user_copy) {
 		struct io_uring_sqe *sqe, *sqe2;
 		__u64 pos = ublk_pos(q->q_id, tag, 0);
 		void *buf = ublksrv_queue_get_io_buf(q, tag);
@@ -352,10 +430,10 @@ static int loop_queue_tgt_io(const struct ublksrv_queue *q,
 		sqe->user_data = build_user_data(tag, ublk_op, 0, 1);
 		break;
 	case UBLK_IO_OP_READ:
-		loop_queue_tgt_read(q, iod, tag);
+		loop_queue_tgt_read(q, data, tag);
 		break;
 	case UBLK_IO_OP_WRITE:
-		loop_queue_tgt_write(q, iod, tag);
+		loop_queue_tgt_write(q, data, tag);
 		break;
 	default:
 		return -EINVAL;
@@ -387,6 +465,10 @@ static co_io_job __loop_handle_io_async(const struct ublksrv_queue *q,
 		if (io->tgt_io_cqe->res == -EAGAIN)
 			goto again;
 
+		if (io->needs_unregister) {
+			io->needs_unregister = false;
+			loop_unregister(q, tag);
+		}
 		ublksrv_complete_io(q, tag, io->tgt_io_cqe->res);
 	} else if (ret < 0) {
 		ublk_err( "fail to queue io %d, ret %d\n", tag, tag);
diff --git a/ublksrv_tgt.cpp b/ublksrv_tgt.cpp
index 8f9cf28..f3ebe14 100644
--- a/ublksrv_tgt.cpp
+++ b/ublksrv_tgt.cpp
@@ -723,7 +723,7 @@ static int cmd_dev_add(int argc, char *argv[])
 			data.tgt_type = optarg;
 			break;
 		case 'z':
-			data.flags |= UBLK_F_SUPPORT_ZERO_COPY;
+			data.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY;
 			break;
 		case 'q':
 			data.nr_hw_queues = strtol(optarg, NULL, 10);

             reply	other threads:[~2025-02-26 18:10 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-26 18:09 Keith Busch [this message]
2025-02-26 18:09 ` [PATCHv6 1/5] io_uring: add support for kernel registered bvecs Keith Busch
2025-02-26 18:09 ` [PATCHv6 1/6] io_uring/rw: move fixed buffer import to issue path Keith Busch
2025-02-26 18:09 ` [PATCHv6 2/6] io_uring: add support for kernel registered bvecs Keith Busch
2025-02-26 18:09 ` [PATCHv6 2/5] nvme: map uring_cmd data even if address is 0 Keith Busch
2025-02-26 18:09 ` [PATCHv6 3/6] " Keith Busch
2025-02-26 18:09 ` [PATCHv6 3/5] ublk: zc register/unregister bvec Keith Busch
2025-02-26 18:09 ` [PATCHv6 4/5] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-26 18:09 ` [PATCHv6 4/6] ublk: zc register/unregister bvec Keith Busch
2025-02-26 18:10 ` [PATCHv6 5/6] io_uring: add abstraction for buf_table rsrc data Keith Busch
2025-02-26 18:10 ` [PATCHv6 5/5] io_uring: cache nodes and mapped buffers Keith Busch
2025-02-26 18:10 ` [PATCHv6 6/6] " Keith Busch
2025-02-26 18:17 ` [PATCHv6 0/6] ublk zero copy support Keith Busch

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox