public inbox for [email protected]
 help / color / mirror / Atom feed
From: Ming Lei <[email protected]>
To: Jens Axboe <[email protected]>,
	[email protected], [email protected],
	[email protected],
	Alexander Viro <[email protected]>
Cc: Stefan Hajnoczi <[email protected]>,
	Miklos Szeredi <[email protected]>,
	Bernd Schubert <[email protected]>,
	Nitesh Shetty <[email protected]>,
	Christoph Hellwig <[email protected]>,
	Ziyang Zhang <[email protected]>,
	Ming Lei <[email protected]>
Subject: [PATCH 3/4] io_uring: add IORING_OP_READ[WRITE]_SPLICE_BUF
Date: Fri, 10 Feb 2023 23:32:11 +0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

IORING_OP_READ_SPLICE_BUF: read into a buffer that is built from the
->read_splice() of the specified fd, so the user needs to provide
(splice_fd, offset, len) for building the buffer.

IORING_OP_WRITE_SPLICE_BUF: write from a buffer that is built from the
->read_splice() of the specified fd, so the user needs to provide
(splice_fd, offset, len) for building the buffer.

The typical use case is supporting ublk/fuse io_uring zero copy: the
READ/WRITE OP retrieves the ublk/fuse request buffer via a direct pipe
from device->read_splice(), and then the READ/WRITE can be done to/from
this buffer directly.

Signed-off-by: Ming Lei <[email protected]>
---
 include/uapi/linux/io_uring.h |   2 +
 io_uring/opdef.c              |  37 ++++++++
 io_uring/rw.c                 | 174 +++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 |   1 +
 4 files changed, 213 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 636a4c2c1294..bada0c91a350 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -223,6 +223,8 @@ enum io_uring_op {
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
 	IORING_OP_SENDMSG_ZC,
+	IORING_OP_READ_SPLICE_BUF,
+	IORING_OP_WRITE_SPLICE_BUF,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 5238ecd7af6a..91e8d8f96134 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -427,6 +427,31 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
+	[IORING_OP_READ_SPLICE_BUF] = {
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollin			= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.prep			= io_prep_rw,
+		.issue			= io_read,
+	},
+	[IORING_OP_WRITE_SPLICE_BUF] = {
+		.needs_file		= 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollout		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.prep			= io_prep_rw,
+		.issue			= io_write,
+	},
 };
 
 
@@ -647,6 +672,18 @@ const struct io_cold_def io_cold_defs[] = {
 		.fail			= io_sendrecv_fail,
 #endif
 	},
+	[IORING_OP_READ_SPLICE_BUF] = {
+		.async_size		= sizeof(struct io_async_rw),
+		.name			= "READ_TO_SPLICE_BUF",
+		.cleanup		= io_read_write_cleanup,
+		.fail			= io_rw_fail,
+	},
+	[IORING_OP_WRITE_SPLICE_BUF] = {
+		.async_size		= sizeof(struct io_async_rw),
+		.name			= "WRITE_FROM_SPLICE_BUF",
+		.cleanup		= io_read_write_cleanup,
+		.fail			= io_rw_fail,
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index efe6bfda9ca9..381514fd1bc5 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -73,6 +73,175 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req)
 	return 0;
 }
 
+/* State shared between io_prep_rw_splice_buf() and io_splice_buf_actor() */
+struct io_rw_splice_buf_data {
+	/* bytes collected so far: sum of the recorded pipe buffer lengths */
+	unsigned long total;
+	/* number of bvec slots currently allocated in *imu */
+	unsigned int  max_bvecs;
+	/* address of the caller's imu pointer; rewritten when the table grows */
+	struct io_mapped_ubuf **imu;
+};
+
+/*
+ * Upper bound on the number of bvec entries: the 'io_mapped_ubuf' header
+ * plus its trailing bvec array must fit into a single page.
+ */
+static inline unsigned int io_rw_max_splice_buf_bvecs(void)
+{
+	const size_t room = PAGE_SIZE - sizeof(struct io_mapped_ubuf);
+
+	return room / sizeof(struct bio_vec);
+}
+
+/*
+ * Initial number of bvec slots for a request of 'len' bytes: one slot per
+ * page of length, capped at io_rw_max_splice_buf_bvecs().
+ */
+static inline unsigned int io_rw_splice_buf_nr_bvecs(unsigned long len)
+{
+	unsigned int nr_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned int limit = io_rw_max_splice_buf_bvecs();
+
+	return nr_pages < limit ? nr_pages : limit;
+}
+
+/* True when 'req' is one of the two splice-buffer read/write opcodes */
+static inline bool io_rw_splice_buf(struct io_kiocb *req)
+{
+	switch (req->opcode) {
+	case IORING_OP_READ_SPLICE_BUF:
+	case IORING_OP_WRITE_SPLICE_BUF:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Release the splice-built buffer attached to 'req', if any: drop the page
+ * reference held for every recorded bvec, detach the table from the request
+ * and free it. A request without a table is left untouched.
+ */
+static void io_rw_cleanup_splice_buf(struct io_kiocb *req)
+{
+	struct io_mapped_ubuf *table = req->imu;
+	int idx = 0;
+
+	if (table) {
+		while (idx < table->nr_bvecs)
+			put_page(table->bvec[idx++].bv_page);
+
+		req->imu = NULL;
+		kfree(table);
+	}
+}
+
+/*
+ * Per-pipe-buffer actor: record 'buf' as the next bvec of the table found
+ * through sd->u.data and take a reference on its page.
+ *
+ * Returns the length of 'buf' when it was recorded, 0 when the table is full
+ * and cannot grow any further, -ENOMEM when growing the table failed, and
+ * -EINVAL when no page reference could be taken.
+ */
+static int io_splice_buf_actor(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf,
+			       struct splice_desc *sd)
+{
+	struct io_rw_splice_buf_data *data = sd->u.data;
+	struct io_mapped_ubuf *imu = *data->imu;
+	struct bio_vec *bvec;
+
+	if (imu->nr_bvecs >= data->max_bvecs) {
+		/*
+		 * The table is full. Double its capacity, since we don't
+		 * know how many buffers remain, but never exceed the
+		 * one-page limit of io_rw_max_splice_buf_bvecs().
+		 */
+		unsigned nr_bvecs = min(data->max_bvecs * 2,
+				io_rw_max_splice_buf_bvecs());
+		struct io_mapped_ubuf *new_imu;
+
+		/* already at the limit: can't grow, so give up on this buffer */
+		if (nr_bvecs <= data->max_bvecs)
+			return 0;
+
+		new_imu = krealloc(imu, struct_size(imu, bvec, nr_bvecs),
+				GFP_KERNEL);
+		if (!new_imu)
+			return -ENOMEM;
+		imu = new_imu;
+		data->max_bvecs = nr_bvecs;
+		/* publish the (possibly moved) table to the caller */
+		*data->imu = imu;
+	}
+
+	if (!try_get_page(buf->page))
+		return -EINVAL;
+
+	/*
+	 * NOTE(review): the whole buf->len is recorded and returned rather
+	 * than sd->len; presumably the two are equal on this path - confirm
+	 * against __splice_from_pipe().
+	 */
+	bvec = &imu->bvec[imu->nr_bvecs];
+	bvec->bv_page = buf->page;
+	bvec->bv_offset = buf->offset;
+	bvec->bv_len = buf->len;
+	imu->nr_bvecs++;
+	data->total += buf->len;
+
+	return buf->len;
+}
+
+/*
+ * Direct-splice actor handed to splice_direct_to_actor(): runs
+ * io_splice_buf_actor() over the pipe via __splice_from_pipe() and returns
+ * its result.
+ */
+static int io_splice_buf_direct_actor(struct pipe_inode_info *pipe,
+			       struct splice_desc *sd)
+{
+	return __splice_from_pipe(pipe, sd, io_splice_buf_actor);
+}
+
+/*
+ * Splice up to 'len' bytes from 'splice_f' starting at 'splice_off' and let
+ * io_splice_buf_direct_actor() record the resulting pipe buffers in 'data'.
+ * Returns whatever splice_direct_to_actor() returns.
+ */
+static int __io_prep_rw_splice_buf(struct io_kiocb *req,
+				   struct io_rw_splice_buf_data *data,
+				   struct file *splice_f,
+				   size_t len,
+				   loff_t splice_off)
+{
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = SPLICE_F_NONBLOCK | SPLICE_F_KERN_NEED_CONFIRM,
+		.pos = splice_off,
+		.u.data = data,
+		.ignore_sig = true,
+	};
+
+	/* the opcode decides which direction flag is added */
+	if (req->opcode == IORING_OP_READ_SPLICE_BUF)
+		sd.flags |= SPLICE_F_KERN_FOR_READ;
+	else
+		sd.flags |= SPLICE_F_KERN_FOR_WRITE;
+
+	return splice_direct_to_actor(splice_f, &sd,
+			io_splice_buf_direct_actor);
+}
+
+/*
+ * Build the request's buffer from pages spliced out of the file named by
+ * sqe->splice_fd_in, starting at sqe->splice_off_in, for at most rw->len
+ * bytes.
+ *
+ * On success (0) req->imu holds the bvec table with one page reference per
+ * entry, rw->addr is 0 and rw->len is trimmed to the number of bytes that
+ * were actually collected. REQ_F_NEED_CLEANUP is set so the table is
+ * released through the opcode's cleanup handler.
+ *
+ * Returns -EBADF for an invalid or non-readable splice fd, -ENOMEM when the
+ * table cannot be allocated, the splice error when nothing was collected,
+ * and -EFAULT when the splice finished without error but produced no data.
+ */
+static int io_prep_rw_splice_buf(struct io_kiocb *req,
+				 const struct io_uring_sqe *sqe)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	unsigned nr_pages = io_rw_splice_buf_nr_bvecs(rw->len);
+	loff_t splice_off = READ_ONCE(sqe->splice_off_in);
+	struct io_rw_splice_buf_data data;
+	struct io_mapped_ubuf *imu;
+	struct fd splice_fd;
+	int ret;
+
+	splice_fd = fdget(READ_ONCE(sqe->splice_fd_in));
+	if (!splice_fd.file)
+		return -EBADF;
+
+	ret = -EBADF;
+	if (!(splice_fd.file->f_mode & FMODE_READ))
+		goto out_put_fd;
+
+	ret = -ENOMEM;
+	imu = kmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+	if (!imu)
+		goto out_put_fd;
+
+	imu->nr_bvecs = 0;
+
+	data.max_bvecs = nr_pages;
+	data.total = 0;
+	/* the actor may reallocate the table; it updates 'imu' through this */
+	data.imu = &imu;
+
+	/* a splice buffer has no user virtual address */
+	rw->addr = 0;
+	req->flags |= REQ_F_NEED_CLEANUP;
+
+	ret = __io_prep_rw_splice_buf(req, &data, splice_fd.file, rw->len,
+			splice_off);
+	imu->acct_pages = 0;
+	imu->ubuf = 0;
+	imu->ubuf_end = data.total;
+	rw->len = data.total;
+	req->imu = imu;
+	if (data.total) {
+		/* some data was collected: proceed with what we have */
+		ret = 0;
+	} else {
+		io_rw_cleanup_splice_buf(req);
+		/*
+		 * Never report success without a buffer: req->imu is NULL
+		 * now and the issue path hands it to io_import_fixed().
+		 * NOTE(review): -EFAULT is meant to match what the issue
+		 * path would report for a missing buffer - confirm.
+		 */
+		if (ret >= 0)
+			ret = -EFAULT;
+	}
+out_put_fd:
+	/* every path reaching this label holds a valid file reference */
+	fdput(splice_fd);
+
+	return ret;
+}
+
+/*
+ * Cleanup handler for the read/write opcodes: only splice-buffer requests
+ * own anything to release here.
+ */
+void io_read_write_cleanup(struct io_kiocb *req)
+{
+	if (!io_rw_splice_buf(req))
+		return;
+
+	io_rw_cleanup_splice_buf(req);
+}
+
 int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -117,6 +286,8 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		ret = io_iov_buffer_select_prep(req);
 		if (ret)
 			return ret;
+	} else if (io_rw_splice_buf(req)) {
+		return io_prep_rw_splice_buf(req, sqe);
 	}
 
 	return 0;
@@ -371,7 +542,8 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 	size_t sqe_len;
 	ssize_t ret;
 
-	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
+	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED ||
+			io_rw_splice_buf(req)) {
 		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
 		if (ret)
 			return ERR_PTR(ret);
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3b733f4b610a..b37d6f6ecb6a 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -21,4 +21,5 @@ int io_readv_prep_async(struct io_kiocb *req);
 int io_write(struct io_kiocb *req, unsigned int issue_flags);
 int io_writev_prep_async(struct io_kiocb *req);
 void io_readv_writev_cleanup(struct io_kiocb *req);
+void io_read_write_cleanup(struct io_kiocb *req);
 void io_rw_fail(struct io_kiocb *req);
-- 
2.31.1


  parent reply	other threads:[~2023-02-10 15:33 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-10 15:32 [PATCH 0/4] io_uring: add IORING_OP_READ[WRITE]_SPLICE_BUF Ming Lei
2023-02-10 15:32 ` [PATCH 1/4] fs/splice: enhance direct pipe & splice for moving pages in kernel Ming Lei
2023-02-11 15:42   ` Ming Lei
2023-02-11 18:57     ` Linus Torvalds
2023-02-12  1:39       ` Ming Lei
2023-02-13 20:04         ` Linus Torvalds
2023-02-14  0:52           ` Ming Lei
2023-02-14  2:35             ` Ming Lei
2023-02-14 11:03           ` Miklos Szeredi
2023-02-14 14:35             ` Ming Lei
2023-02-14 15:39               ` Miklos Szeredi
2023-02-15  0:11                 ` Ming Lei
2023-02-15 10:36                   ` Miklos Szeredi
2023-02-10 15:32 ` [PATCH 2/4] fs/splice: allow to ignore signal in __splice_from_pipe Ming Lei
2023-02-10 15:32 ` Ming Lei [this message]
2023-02-11 15:45   ` [PATCH 3/4] io_uring: add IORING_OP_READ[WRITE]_SPLICE_BUF Jens Axboe
2023-02-11 16:12     ` Ming Lei
2023-02-11 16:52       ` Jens Axboe
2023-02-12  3:22         ` Ming Lei
2023-02-12  3:55           ` Jens Axboe
2023-02-13  1:06             ` Ming Lei
2023-02-11 17:13   ` Jens Axboe
2023-02-12  1:48     ` Ming Lei
2023-02-12  2:42       ` Jens Axboe
2023-02-10 15:32 ` [PATCH 4/4] ublk_drv: support splice based read/write zero copy Ming Lei
2023-02-10 21:54 ` [PATCH 0/4] io_uring: add IORING_OP_READ[WRITE]_SPLICE_BUF Jens Axboe
2023-02-10 22:19   ` Jens Axboe
2023-02-11  5:13   ` Ming Lei
2023-02-11 15:45     ` Jens Axboe
2023-02-14 16:36 ` Stefan Hajnoczi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox