From: Pavel Begunkov <[email protected]>
To: [email protected]
Cc: [email protected], [email protected]
Subject: [RFC 7/7] io_uring,fs: introduce IORING_OP_GET_BUF
Date: Sun, 30 Apr 2023 10:35:29 +0100
Message-ID: <fc43826d510dc75de83d81161ca03e2688515686.1682701588.git.asml.silence@gmail.com>
In-Reply-To: <[email protected]>
There are several problems with splice requests, aka IORING_OP_SPLICE:
1) They are always executed by a worker thread, which is a slow path,
as we don't have any reliable way to execute them NOWAIT.
2) They can't easily poll for data, as there are two files to operate
on. A request would either need to track which file to poll or poll
both of them; either way it would be a mess and add a lot of overhead.
3) They need pipes in the middle, which adds overhead and is not
great from the uapi design perspective when it comes to io_uring
requests.
4) We want to operate on spliced data as on a normal buffer, i.e.
write / send / etc. it as usual while it stays zerocopy.
These issues can be partially solved, but the root cause is that the
design of IORING_OP_SPLICE is suboptimal for io_uring. Introduce a
new request type called IORING_OP_GET_BUF, inspired by splice(2) as
well as other proposals like fused requests. The main idea is to use
io_uring's registered buffers as the middleman instead of pipes. Once
a buffer is fetched / spliced from a file using a new fops callback
->iou_get_buf, it is installed as a registered buffer and can be used
by all operations supporting the feature.
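
For illustration, a rough sketch of the intended userspace flow (not
part of this patch). There is no dedicated liburing helper for the
new opcode, so the SQE is prepared by hand, with fields laid out as
read by io_get_buf_prep() below; the function name and the assumption
of a pre-registered sparse buffer table are hypothetical, and the
0-based fixed-buffer addressing follows from imu->ubuf being set to 0:

  #include <liburing.h>

  /* Pull up to 4K from src_fd into registered buffer slot 0, then
   * write it out from that slot without an extra copy.
   */
  static int splice_via_get_buf(struct io_uring *ring, int src_fd, int dst_fd)
  {
  	struct io_uring_sqe *sqe;
  	struct io_uring_cqe *cqe;
  	int ret;

  	sqe = io_uring_get_sqe(ring);
  	io_uring_prep_rw(IORING_OP_GET_BUF, sqe, src_fd, NULL, 4096, 0);
  	sqe->buf_index = 0;	/* install into registered buffer slot 0 */

  	/* the buffer is only installed once GET_BUF completes */
  	io_uring_submit_and_wait(ring, 1);
  	ret = io_uring_peek_cqe(ring, &cqe);
  	if (ret)
  		return ret;
  	ret = cqe->res;		/* bytes spliced into the slot */
  	io_uring_cqe_seen(ring, cqe);
  	if (ret < 0)
  		return ret;

  	/* use the slot as a data source; addr 0 is the buffer start */
  	sqe = io_uring_get_sqe(ring);
  	io_uring_prep_write_fixed(sqe, dst_fd, NULL, ret, 0, 0);
  	return io_uring_submit(ring);
  }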
Once userspace releases the buffer, io_uring will wait for all
requests using the buffer to complete and then use a file provided
callback ->release() to return the buffer back to the file. Unlike
splice(2), it operates on the level of the entire buffer rather than
individual pages. As noted in the fused cmd work this idea came from,
the approach should be more flexible and efficient, and it also
leaves room for further optimisations like custom caching or avoiding
page refcounting altogether.
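
To sketch the provider side (purely hypothetical, not from this
series): a minimal ->iou_get_buf() handing out one freshly allocated
page could look roughly as below. The desc->bvec array field is an
assumption based on the earlier patches of the series; nr_bvecs and
the ->release() callback are used as in this patch:

  /* hypothetical example provider, foo_* names are made up */
  static void foo_buf_release(struct iou_buf_desc *desc)
  {
  	/* called once all requests using the buffer have completed */
  	put_page(desc->bvec[0].bv_page);
  }

  static int foo_iou_get_buf(struct file *file, struct iou_get_buf_info *bi)
  {
  	size_t len = min_t(size_t, bi->len, PAGE_SIZE);
  	struct page *page;

  	page = alloc_page(bi->flags & IOU_GET_BUF_F_NOWAIT ?
  			  GFP_NOWAIT : GFP_KERNEL);
  	if (!page)
  		return (bi->flags & IOU_GET_BUF_F_NOWAIT) ? -EAGAIN : -ENOMEM;

  	/* ... fill the page with up to len bytes of data at bi->off ... */

  	bvec_set_page(&bi->desc->bvec[0], page, len, 0);
  	bi->desc->nr_bvecs = 1;
  	bi->desc->release = foo_buf_release;
  	return len;	/* bytes made available, becomes ubuf_end / CQE res */
  }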
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/fs.h | 2 +
include/linux/io_uring.h | 11 +++++
include/uapi/linux/io_uring.h | 1 +
io_uring/opdef.c | 11 +++++
io_uring/rsrc.c | 2 +-
io_uring/rsrc.h | 2 +
io_uring/splice.c | 90 +++++++++++++++++++++++++++++++++++
io_uring/splice.h | 4 ++
8 files changed, 122 insertions(+), 1 deletion(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 475d88640d3d..a2528a39571f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1753,6 +1753,7 @@ struct dir_context {
struct iov_iter;
struct io_uring_cmd;
+struct iou_get_buf_info;
struct file_operations {
struct module *owner;
@@ -1798,6 +1799,7 @@ struct file_operations {
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
unsigned int poll_flags);
+ int (*iou_get_buf)(struct file *file, struct iou_get_buf_info *);
} __randomize_layout;
struct inode_operations {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index e0e7df5beefc..9564db555bab 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -30,6 +30,17 @@ struct iou_buf_desc {
void *private;
};
+enum {
+ IOU_GET_BUF_F_NOWAIT = 1,
+};
+
+struct iou_get_buf_info {
+ loff_t off;
+ size_t len;
+ unsigned flags;
+ struct iou_buf_desc *desc;
+};
+
struct io_uring_cmd {
struct file *file;
const void *cmd;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0716cb17e436..b244215d03ad 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -223,6 +223,7 @@ enum io_uring_op {
IORING_OP_URING_CMD,
IORING_OP_SEND_ZC,
IORING_OP_SENDMSG_ZC,
+ IORING_OP_GET_BUF,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index cca7c5b55208..d3b7144c685a 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -428,6 +428,13 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_GET_BUF] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollin = 1,
+ .prep = io_get_buf_prep,
+ .issue = io_get_buf,
+ },
};
@@ -648,6 +655,10 @@ const struct io_cold_def io_cold_defs[] = {
.fail = io_sendrecv_fail,
#endif
},
+ [IORING_OP_GET_BUF] = {
+ .name = "IORING_OP_GET_BUF",
+ .cleanup = io_get_buf_cleanup,
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index db4286b42dce..bdcd417bca87 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -89,7 +89,7 @@ static void io_put_reg_buf(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
kvfree(imu);
}
-static struct io_mapped_ubuf *io_alloc_reg_buf(struct io_ring_ctx *ctx,
+struct io_mapped_ubuf *io_alloc_reg_buf(struct io_ring_ctx *ctx,
int nr_bvecs)
{
struct io_cache_entry *entry;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index aba95bdd060e..6aaf7acb60c5 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -177,4 +177,6 @@ static inline void io_reg_buf_release(struct io_mapped_ubuf *imu)
imu->desc.release(&imu->desc);
}
+struct io_mapped_ubuf *io_alloc_reg_buf(struct io_ring_ctx *ctx, int nr_bvecs);
+
#endif
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 2a4bbb719531..3d50334caec5 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -13,6 +13,7 @@
#include "io_uring.h"
#include "splice.h"
+#include "rsrc.h"
struct io_splice {
struct file *file_out;
@@ -119,3 +120,92 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
io_req_set_res(req, ret, 0);
return IOU_OK;
}
+
+struct io_get_buf {
+ struct file *file;
+ struct io_mapped_ubuf *imu;
+ int max_pages;
+ loff_t off;
+ u64 len;
+};
+
+void io_get_buf_cleanup(struct io_kiocb *req)
+{
+ struct io_get_buf *gb = io_kiocb_to_cmd(req, struct io_get_buf);
+ struct io_mapped_ubuf *imu = gb->imu;
+
+ if (!imu)
+ return;
+ if (imu->desc.nr_bvecs && !WARN_ON_ONCE(!imu->desc.release))
+ io_reg_buf_release(imu);
+
+ kvfree(imu);
+ gb->imu = NULL;
+}
+
+int io_get_buf_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_get_buf *gb = io_kiocb_to_cmd(req, struct io_get_buf);
+ struct io_mapped_ubuf *imu;
+ int nr_pages;
+
+ if (unlikely(sqe->splice_flags || sqe->splice_fd_in || sqe->ioprio ||
+ sqe->addr || sqe->addr3))
+ return -EINVAL;
+
+ req->buf_index = READ_ONCE(sqe->buf_index);
+ gb->len = READ_ONCE(sqe->len);
+ gb->off = READ_ONCE(sqe->off);
+ nr_pages = (gb->len >> PAGE_SHIFT) + 2;
+ gb->max_pages = nr_pages;
+
+ gb->imu = imu = io_alloc_reg_buf(req->ctx, nr_pages);
+ if (!imu)
+ return -ENOMEM;
+ imu->desc.nr_bvecs = 0;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+int io_get_buf(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_get_buf *gb = io_kiocb_to_cmd(req, struct io_get_buf);
+ struct io_mapped_ubuf *imu = gb->imu;
+ struct iou_get_buf_info bi;
+ int ret, err;
+
+ bi.off = gb->off;
+ bi.len = gb->len;
+ bi.flags = (issue_flags & IO_URING_F_NONBLOCK) ? IOU_GET_BUF_F_NOWAIT : 0;
+ bi.desc = &imu->desc;
+
+ if (!gb->file->f_op->iou_get_buf)
+ return -EOPNOTSUPP;
+ ret = gb->file->f_op->iou_get_buf(gb->file, &bi);
+ if (ret < 0) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return -EAGAIN;
+ goto done;
+ }
+
+ imu->ubuf = 0;
+ imu->ubuf_end = ret;
+ imu->dir_mask = 1U << ITER_SOURCE;
+ imu->acct_pages = 0;
+
+ io_ring_submit_lock(req->ctx, issue_flags);
+ err = io_install_buffer(req->ctx, imu, req->buf_index);
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ if (unlikely(err)) {
+ ret = err;
+ goto done;
+ }
+
+ gb->imu = NULL;
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+done:
+ if (ret != gb->len)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/splice.h b/io_uring/splice.h
index 542f94168ad3..2b923fc2bbf1 100644
--- a/io_uring/splice.h
+++ b/io_uring/splice.h
@@ -5,3 +5,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags);
int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_splice(struct io_kiocb *req, unsigned int issue_flags);
+
+int io_get_buf_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_get_buf(struct io_kiocb *req, unsigned int issue_flags);
+void io_get_buf_cleanup(struct io_kiocb *req);
--
2.40.0
Thread overview: 14+ messages
2023-04-30 9:35 [RFC 0/7] Rethinking splice Pavel Begunkov
2023-04-30 9:35 ` [RFC 1/7] io_uring: add io_mapped_ubuf caches Pavel Begunkov
2023-04-30 9:35 ` [RFC 2/7] io_uring: add reg-buffer data directions Pavel Begunkov
2023-04-30 9:35 ` [RFC 3/7] io_uring: fail loop_rw_iter with pure bvec bufs Pavel Begunkov
2023-04-30 9:35 ` [RFC 4/7] io_uring/rsrc: introduce struct iou_buf_desc Pavel Begunkov
2023-04-30 9:35 ` [RFC 5/7] io_uring/rsrc: add buffer release callbacks Pavel Begunkov
2023-04-30 9:35 ` [RFC 6/7] io_uring/rsrc: introduce helper installing one buffer Pavel Begunkov
2023-04-30 9:35 ` Pavel Begunkov [this message]
2023-05-02 14:57 ` [RFC 7/7] io_uring,fs: introduce IORING_OP_GET_BUF Ming Lei
2023-05-02 15:20 ` Ming Lei
2023-05-03 14:54 ` Pavel Begunkov
2023-05-04 2:06 ` Ming Lei
2023-05-08 2:30 ` Pavel Begunkov
2023-05-17 4:05 ` Ming Lei