From: Kanchan Joshi <[email protected]>
To: [email protected], [email protected], [email protected],
[email protected]
Cc: [email protected], [email protected],
[email protected], Anuj Gupta <[email protected]>,
Kanchan Joshi <[email protected]>,
Nitesh Shetty <[email protected]>
Subject: [RFC PATCH 2/4] io_uring/rw: support read/write with metadata
Date: Sat, 23 Mar 2024 00:20:21 +0530 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: Anuj Gupta <[email protected]>
This patch introduces IORING_OP_READ_META and IORING_OP_WRITE_META
opcodes which allow sending a meta buffer along with read/write.
An application can do that by using the newly added meta_addr and meta_len
fields of the SQE.
These opcodes are supported only for direct IO.
Signed-off-by: Anuj Gupta <[email protected]>
Signed-off-by: Kanchan Joshi <[email protected]>
Signed-off-by: Nitesh Shetty <[email protected]>
---
include/linux/fs.h | 1 +
include/uapi/linux/io_uring.h | 6 +++
io_uring/io_uring.c | 2 +
io_uring/opdef.c | 29 ++++++++++++
io_uring/rw.c | 86 +++++++++++++++++++++++++++++++++--
io_uring/rw.h | 8 ++++
6 files changed, 129 insertions(+), 3 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0a22b7245982..c3a483a4fdac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -327,6 +327,7 @@ struct readahead_control;
#define IOCB_NOIO (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE (1 << 21)
+#define IOCB_USE_META (1 << 22)
/*
* IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
* iocb completion can be passed back to the owner for execution from a safe
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7bd10201a02b..87bd44098037 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -97,6 +97,10 @@ struct io_uring_sqe {
__u64 addr3;
__u64 __pad2[1];
};
+ struct {
+ __u64 meta_addr;
+ __u32 meta_len;
+ };
__u64 optval;
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
@@ -256,6 +260,8 @@ enum io_uring_op {
IORING_OP_FUTEX_WAITV,
IORING_OP_FIXED_FD_INSTALL,
IORING_OP_FTRUNCATE,
+ IORING_OP_READ_META,
+ IORING_OP_WRITE_META,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 49a124daa359..7c380cac4465 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4134,7 +4134,9 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
+ BUILD_BUG_SQE_ELEM(48, __u64, meta_addr);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+ BUILD_BUG_SQE_ELEM(56, __u32, meta_len);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9c080aadc5a6..cb31573ac4ad 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -146,6 +146,26 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_READ_META] = {
+ .needs_file = 1,
+ .plug = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .prep = io_prep_rw_meta,
+ .issue = io_rw_meta,
+ },
+ [IORING_OP_WRITE_META] = {
+ .needs_file = 1,
+ .plug = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .prep = io_prep_rw_meta,
+ .issue = io_rw_meta,
+ },
[IORING_OP_RECVMSG] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -501,6 +521,15 @@ const struct io_cold_def io_cold_defs[] = {
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
+	/* Cold (slow-path) definitions for the read/write-with-metadata opcodes. */
+	[IORING_OP_READ_META] = {
+		.async_size		= sizeof(struct io_async_rw),
+		.name			= "READ_META",
+		.fail			= io_rw_fail,
+	},
+	[IORING_OP_WRITE_META] = {
+		.async_size		= sizeof(struct io_async_rw),
+		.name			= "WRITE_META",
+		/* same failure handler as READ_META, so both meta opcodes behave alike */
+		.fail			= io_rw_fail,
+	},
[IORING_OP_FSYNC] = {
.name = "FSYNC",
},
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 40f6c2a59928..87a6304052f0 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -27,6 +27,7 @@ struct io_rw {
struct kiocb kiocb;
u64 addr;
u32 len;
+ u32 meta_len;
};
static inline bool io_file_supports_nowait(struct io_kiocb *req)
@@ -107,6 +108,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
+/*
+ * Prep handler for the READ_META/WRITE_META opcodes.
+ *
+ * Runs the regular io_prep_rw() first and bails out on its error. On
+ * success, the user meta address from the SQE is parked in kiocb->private,
+ * the meta length is recorded in the io_rw command, and the kiocb is
+ * tagged with IOCB_USE_META.
+ *
+ * Returns 0 on success or the error returned by io_prep_rw().
+ */
+int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	int err = io_prep_rw(req, sqe);
+
+	if (unlikely(err))
+		return err;
+
+	/* each SQE field is read exactly once */
+	rw->kiocb.private = u64_to_user_ptr(READ_ONCE(sqe->meta_addr));
+	rw->meta_len = READ_ONCE(sqe->meta_len);
+	rw->kiocb.ki_flags |= IOCB_USE_META;
+
+	return 0;
+}
+
+
int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
int ret;
@@ -571,9 +588,18 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
}
}
+/*
+ * Copy the meta iterator from @sm into the async data of the request and
+ * take a snapshot of the copied iterator's state alongside it.
+ */
+static inline void io_req_map_meta(struct io_async_rw *iorw, struct io_rw_state_meta *sm)
+{
+	struct io_rw_state_meta *dst = &iorw->s_meta;
+
+	dst->iter_meta = sm->iter_meta;
+	iov_iter_save_state(&dst->iter_meta, &dst->iter_state_meta);
+}
+
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
struct io_rw_state *s, bool force)
{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct kiocb *kiocb = &rw->kiocb;
+
if (!force && !io_cold_defs[req->opcode].prep_async)
return 0;
/* opcode type doesn't need async data */
@@ -591,6 +617,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
iorw = req->async_data;
/* we've copied and mapped the iter, ensure state is saved */
iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
+ if (unlikely(kiocb->ki_flags & IOCB_USE_META)) {
+ struct io_rw_state_meta *sm = kiocb->private;
+
+ io_req_map_meta(iorw, sm);
+ }
}
return 0;
}
@@ -747,7 +778,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
- kiocb->private = NULL;
+ if (likely(!(kiocb->ki_flags & IOCB_USE_META)))
+ kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
@@ -766,6 +798,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
struct io_rw_state __s, *s = &__s;
struct iovec *iovec;
struct kiocb *kiocb = &rw->kiocb;
+ struct io_rw_state_meta *sm = kiocb->private;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct io_async_rw *io;
ssize_t ret, ret2;
@@ -840,13 +873,16 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (req->flags & REQ_F_NOWAIT)
goto done;
+ if (kiocb->ki_flags & IOCB_USE_META)
+ kiocb->private = sm;
ret = 0;
} else if (ret == -EIOCBQUEUED) {
if (iovec)
kfree(iovec);
return IOU_ISSUE_SKIP_COMPLETE;
} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
- (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
+ (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
+ (kiocb->ki_flags & IOCB_USE_META)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
@@ -857,6 +893,12 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* manually if we need to.
*/
iov_iter_restore(&s->iter, &s->iter_state);
+ if (unlikely(kiocb->ki_flags & IOCB_USE_META)) {
+ /* don't handle partial completion for read + meta */
+ if (ret > 0)
+ goto done;
+ iov_iter_restore(&sm->iter_meta, &sm->iter_state_meta);
+ }
ret2 = io_setup_async_rw(req, iovec, s, true);
iovec = NULL;
@@ -1070,7 +1112,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
- if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
+ if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)
+ && !(kiocb->ki_flags & IOCB_USE_META)) {
struct io_async_rw *io;
trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
@@ -1111,6 +1154,43 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
+/*
+ * Issue handler shared by IORING_OP_READ_META and IORING_OP_WRITE_META.
+ *
+ * Builds an iov_iter over the user meta buffer (or, when async data is
+ * already attached, restores the copy saved there), publishes it through
+ * kiocb->private and hands off to __io_read() or io_write().
+ *
+ * Returns -EOPNOTSUPP when the file was not opened with O_DIRECT, the
+ * error from import_ubuf() if the meta buffer cannot be imported, and
+ * otherwise the result of the read/write path. On -EAGAIN the user meta
+ * address is put back into kiocb->private so that a later issue attempt
+ * finds it again.
+ */
+int io_rw_meta(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct kiocb *kiocb = &rw->kiocb;
+	/*
+	 * At this point ->private still holds the user address stored by
+	 * io_prep_rw_meta(). Go through unsigned long so the pointer to u64
+	 * conversion is also clean where pointers are narrower than 64 bits.
+	 */
+	void __user *meta_addr = u64_to_user_ptr((u64)(unsigned long)kiocb->private);
+	struct io_rw_state_meta __sm, *sm = &__sm;
+	int ret;
+
+	if (!(req->file->f_flags & O_DIRECT))
+		return -EOPNOTSUPP;
+	/* prepare iter for meta-buffer */
+	if (!req_has_async_data(req)) {
+		/* the meta buffer is the destination for a read, the source for a write */
+		unsigned int ddir = req->opcode == IORING_OP_READ_META ?
+					ITER_DEST : ITER_SOURCE;
+
+		ret = import_ubuf(ddir, meta_addr, rw->meta_len, &sm->iter_meta);
+		if (unlikely(ret < 0))
+			return ret;
+		/* only snapshot an iterator that was imported successfully */
+		iov_iter_save_state(&sm->iter_meta, &sm->iter_state_meta);
+	} else {
+		struct io_async_rw *io = req->async_data;
+
+		sm = &io->s_meta;
+		iov_iter_restore(&sm->iter_meta, &sm->iter_state_meta);
+	}
+	/* Store iter for meta-buf in private, will be used later */
+	kiocb->private = sm;
+	if (req->opcode == IORING_OP_READ_META) {
+		ret = __io_read(req, issue_flags);
+		if (ret >= 0)
+			return kiocb_done(req, ret, issue_flags);
+	} else {
+		ret = io_write(req, issue_flags);
+	}
+	/* hand the raw user address back for the next issue attempt */
+	if (ret == -EAGAIN)
+		kiocb->private = meta_addr;
+	return ret;
+}
+
+
void io_rw_fail(struct io_kiocb *req)
{
int res;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index f9e89b4fe4da..7c12216776bc 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -8,19 +8,27 @@ struct io_rw_state {
struct iovec fast_iov[UIO_FASTIOV];
};
+struct io_rw_state_meta { /* meta-buffer iterator together with its saved state */
+ struct iov_iter iter_meta; /* iov_iter set up over the user meta buffer */
+ struct iov_iter_state iter_state_meta; /* snapshot of iter_meta, used with iov_iter_restore() */
+};
+
struct io_async_rw {
struct io_rw_state s;
+ struct io_rw_state_meta s_meta;
const struct iovec *free_iovec;
size_t bytes_done;
struct wait_page_queue wpq;
};
int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_read(struct io_kiocb *req, unsigned int issue_flags);
int io_readv_prep_async(struct io_kiocb *req);
int io_write(struct io_kiocb *req, unsigned int issue_flags);
+int io_rw_meta(struct io_kiocb *req, unsigned int issue_flags);
int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
--
2.25.1
next prev parent reply other threads:[~2024-03-22 18:57 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CGME20240322185729epcas5p350c5054b5b519a6aa9d1b35ba3709563@epcas5p3.samsung.com>
2024-03-22 18:50 ` [RFC PATCH 0/4] Read/Write with meta buffer Kanchan Joshi
[not found] ` <CGME20240322185731epcas5p20fc525f793a537310f7b3ae5ba5bc75b@epcas5p2.samsung.com>
2024-03-22 18:50 ` [RFC PATCH 1/4] io_uring/rw: Get rid of flags field in struct io_rw Kanchan Joshi
2024-03-27 23:30 ` David Wei
2024-03-27 23:32 ` David Wei
[not found] ` <CGME20240322185734epcas5p2cd407dac97cd157c1833c4022ea84805@epcas5p2.samsung.com>
2024-03-22 18:50 ` Kanchan Joshi [this message]
[not found] ` <CGME20240322185736epcas5p3d0093948e9904e775994bcbe735ea0c5@epcas5p3.samsung.com>
2024-03-22 18:50 ` [RFC PATCH 3/4] block: modify bio_integrity_map_user to accept iov_iter as argument Kanchan Joshi
[not found] ` <CGME20240322185738epcas5p20e5bd448ce83350eb9e79c929c4a9b2b@epcas5p2.samsung.com>
2024-03-22 18:50 ` [RFC PATCH 4/4] block: add support to pass the meta buffer Kanchan Joshi
2024-03-27 23:38 ` [RFC PATCH 0/4] Read/Write with " Jens Axboe
2024-03-28 12:03 ` Kanchan Joshi
2024-04-06 21:30 ` Pavel Begunkov
2024-04-25 19:05 ` Kanchan Joshi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox