From: Kanchan Joshi <[email protected]>
To: [email protected], [email protected], [email protected],
[email protected], [email protected]
Cc: [email protected], [email protected], [email protected],
[email protected], [email protected],
[email protected], Anuj Gupta <[email protected]>,
Kanchan Joshi <[email protected]>,
Nitesh Shetty <[email protected]>
Subject: [PATCH 08/10] io_uring/rw: add support to send meta along with read/write
Date: Fri, 26 Apr 2024 00:09:41 +0530
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>

From: Anuj Gupta <[email protected]>

This patch introduces the IORING_OP_READ_META and IORING_OP_WRITE_META
opcodes, which allow sending a meta buffer along with a read/write. The
meta buffer, its length, the apptag, and the integrity check flags can
be specified by the application via the newly introduced meta_addr,
meta_len, apptag and meta_flags fields of the SQE.

Use the user-passed information to prepare a uio_meta descriptor, and
pass it down via kiocb->private.

Meta exchange is supported only for direct IO.
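
As an illustration, a consumer could fill the new fields roughly as
follows (untested sketch; assumes liburing plus the updated uapi header
from this series, 'fd' opened with O_DIRECT on a metadata-capable
device, and caller-allocated, suitably aligned data_buf/meta_buf):

	/* read one 4096-byte logical block together with its metadata */
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_read(sqe, fd, data_buf, 4096, 0);
	/* switch to the new opcode and attach the meta buffer */
	sqe->opcode = IORING_OP_READ_META;
	sqe->meta_addr = (__u64)(uintptr_t)meta_buf;
	sqe->meta_len = 8;	/* e.g. 8 bytes of PI per 4KiB block */
	sqe->meta_flags = META_CHK_GUARD;
	sqe->apptag = 0;
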
Signed-off-by: Anuj Gupta <[email protected]>
Signed-off-by: Kanchan Joshi <[email protected]>
Signed-off-by: Nitesh Shetty <[email protected]>
---
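Note: meta_len covers the metadata for the entire data transfer.
Assuming a typical 4096+8 protection-information format, a 32KiB read
spans 8 logical blocks and would therefore need meta_len = 8 * 8 = 64
bytes.
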
include/linux/fs.h | 1 +
include/uapi/linux/io_uring.h | 15 +++++++
io_uring/io_uring.c | 4 ++
io_uring/opdef.c | 30 ++++++++++++++
io_uring/rw.c | 76 +++++++++++++++++++++++++++++++++--
io_uring/rw.h | 11 ++++-
6 files changed, 132 insertions(+), 5 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8dfd53b52744..8868d17ae8f9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -329,6 +329,7 @@ struct readahead_control;
#define IOCB_NOIO (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE (1 << 21)
+#define IOCB_USE_META (1 << 22)
/*
* IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
* iocb completion can be passed back to the owner for execution from a safe
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a7f847543a7f..d4653b52fdd6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -97,6 +97,12 @@ struct io_uring_sqe {
__u64 addr3;
__u64 __pad2[1];
};
+ struct {
+ __u64 meta_addr;
+ __u32 meta_len;
+ __u16 meta_flags;
+ __u16 apptag;
+ };
__u64 optval;
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
@@ -106,6 +112,13 @@ struct io_uring_sqe {
};
};
+/*
+ * meta io flags
+ */
+#define META_CHK_GUARD (1U << 0) /* guard is valid */
+#define META_CHK_APPTAG (1U << 1) /* app tag is valid */
+#define META_CHK_REFTAG (1U << 2) /* ref tag is valid */
+
/*
* If sqe->file_index is set to this for opcodes that instantiate a new
* direct descriptor (like openat/openat2/accept), then io_uring will allocate
@@ -256,6 +269,8 @@ enum io_uring_op {
IORING_OP_FUTEX_WAITV,
IORING_OP_FIXED_FD_INSTALL,
IORING_OP_FTRUNCATE,
+ IORING_OP_READ_META,
+ IORING_OP_WRITE_META,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3c9087f37c43..af95fc8d988c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3723,7 +3723,11 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
+ BUILD_BUG_SQE_ELEM(48, __u64, meta_addr);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+ BUILD_BUG_SQE_ELEM(56, __u32, meta_len);
+ BUILD_BUG_SQE_ELEM(60, __u16, meta_flags);
+ BUILD_BUG_SQE_ELEM(62, __u16, apptag);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index a16f73938ebb..8b8fdcfb7f30 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -444,6 +444,28 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_READ_META] = {
+ .needs_file = 1,
+ .plug = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_read_meta,
+ .issue = io_rw_meta,
+ },
+ [IORING_OP_WRITE_META] = {
+ .needs_file = 1,
+ .plug = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_write_meta,
+ .issue = io_rw_meta,
+ },
[IORING_OP_READ_MULTISHOT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -510,6 +532,14 @@ const struct io_cold_def io_cold_defs[] = {
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
+ [IORING_OP_READ_META] = {
+ .name = "READ_META",
+ .fail = io_rw_fail,
+ },
+ [IORING_OP_WRITE_META] = {
+ .name = "WRITE_META",
+ .fail = io_rw_fail,
+ },
[IORING_OP_FSYNC] = {
.name = "FSYNC",
},
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 3134a6ece1be..b2c9ac91d5e5 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -269,6 +269,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->kiocb.ki_ioprio = get_current_ioprio();
}
rw->kiocb.dio_complete = NULL;
+ rw->kiocb.ki_flags = 0;
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
@@ -286,6 +287,41 @@ int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_prep_rw(req, sqe, ITER_SOURCE, true);
}
+static int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ int ddir, bool import)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_async_rw *io;
+ struct kiocb *kiocb = &rw->kiocb;
+ int ret;
+
+ ret = io_prep_rw(req, sqe, ddir, import);
+ if (unlikely(ret))
+ return ret;
+
+ io = req->async_data;
+ kiocb->ki_flags |= IOCB_USE_META;
+ io->meta.flags = READ_ONCE(sqe->meta_flags);
+ io->meta.apptag = READ_ONCE(sqe->apptag);
+ ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(sqe->meta_addr)),
+ READ_ONCE(sqe->meta_len), &io->meta.iter);
+ if (unlikely(ret < 0))
+ return ret;
+
+ iov_iter_save_state(&io->meta.iter, &io->iter_meta_state);
+ return 0;
+}
+
+int io_prep_read_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return io_prep_rw_meta(req, sqe, ITER_DEST, true);
+}
+
+int io_prep_write_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return io_prep_rw_meta(req, sqe, ITER_SOURCE, true);
+}
+
static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe,
int ddir)
{
@@ -587,6 +623,8 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
req->flags &= ~REQ_F_REISSUE;
iov_iter_restore(&io->iter, &io->iter_state);
+ if (unlikely(rw->kiocb.ki_flags & IOCB_USE_META))
+ iov_iter_restore(&io->meta.iter, &io->iter_meta_state);
return -EAGAIN;
}
return IOU_ISSUE_SKIP_COMPLETE;
@@ -768,7 +806,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (!(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(file);
- kiocb->ki_flags = file->f_iocb_flags;
+ kiocb->ki_flags |= file->f_iocb_flags;
ret = kiocb_set_rw_flags(kiocb, rw->flags);
if (unlikely(ret))
return ret;
@@ -787,7 +825,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
- kiocb->private = NULL;
+ if (likely(!(kiocb->ki_flags & IOCB_USE_META)))
+ kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
@@ -853,7 +892,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
} else if (ret == -EIOCBQUEUED) {
return IOU_ISSUE_SKIP_COMPLETE;
} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
- (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
+ (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
+ (kiocb->ki_flags & IOCB_USE_META)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
@@ -864,6 +904,12 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* manually if we need to.
*/
iov_iter_restore(&io->iter, &io->iter_state);
+ if (unlikely(kiocb->ki_flags & IOCB_USE_META)) {
+ /* don't handle partial completion for read + meta */
+ if (ret > 0)
+ goto done;
+ iov_iter_restore(&io->meta.iter, &io->iter_meta_state);
+ }
do {
/*
@@ -1053,7 +1099,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto ret_eagain;
- if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
+ if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)
+ && !(kiocb->ki_flags & IOCB_USE_META)) {
trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
req->cqe.res, ret2);
@@ -1074,12 +1121,33 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
} else {
ret_eagain:
iov_iter_restore(&io->iter, &io->iter_state);
+ if (unlikely(kiocb->ki_flags & IOCB_USE_META))
+ iov_iter_restore(&io->meta.iter, &io->iter_meta_state);
if (kiocb->ki_flags & IOCB_WRITE)
io_req_end_write(req);
return -EAGAIN;
}
}
+int io_rw_meta(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_async_rw *io = req->async_data;
+ struct kiocb *kiocb = &rw->kiocb;
+ int ret;
+
+ if (!(req->file->f_flags & O_DIRECT))
+ return -EOPNOTSUPP;
+
+ kiocb->private = &io->meta;
+ if (req->opcode == IORING_OP_READ_META)
+ ret = io_read(req, issue_flags);
+ else
+ ret = io_write(req, issue_flags);
+
+ return ret;
+}
+
void io_rw_fail(struct io_kiocb *req)
{
int res;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..a640071064e3 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -9,7 +9,13 @@ struct io_async_rw {
struct iovec fast_iov;
struct iovec *free_iovec;
int free_iov_nr;
- struct wait_page_queue wpq;
+ union {
+ struct wait_page_queue wpq;
+ struct {
+ struct uio_meta meta;
+ struct iov_iter_state iter_meta_state;
+ };
+ };
};
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
@@ -17,9 +23,12 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_read_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_write_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_read(struct io_kiocb *req, unsigned int issue_flags);
int io_write(struct io_kiocb *req, unsigned int issue_flags);
+int io_rw_meta(struct io_kiocb *req, unsigned int issue_flags);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
--
2.25.1