public inbox for [email protected]
 help / color / mirror / Atom feed
From: Anuj Gupta <[email protected]>
To: [email protected], [email protected], [email protected],
	[email protected], [email protected],
	[email protected], [email protected], [email protected],
	[email protected]
Cc: [email protected], [email protected],
	[email protected], [email protected],
	[email protected], [email protected],
	[email protected], Anuj Gupta <[email protected]>,
	Kanchan Joshi <[email protected]>
Subject: [PATCH v8 06/10] io_uring/rw: add support to send metadata along with read/write
Date: Wed,  6 Nov 2024 17:48:38 +0530	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

This patch adds the capability of passing integrity metadata along with
read/write. A new ext_cap (extended_capability) field is introduced in SQE
which indicates the type of extra information being sent. A new
'struct io_uring_sqe_ext' represents the secondary SQE space for
read/write. In future if another extension needs to be added, then one
needs to:
1. Add extra fields in the sqe/secondary-sqe
2. Introduce a ext_cap flag indicating additional values that have been
passed

The last 32 bytes of secondary SQE is used to pass following PI related
information:

- flags: integrity check flags namely
IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- len: length of the pi/metadata buffer
- buf: address of the metadata buffer
- seed: seed value for reftag remapping
- app_tag: application defined 16b value

Application sets up a SQE128 ring, prepares PI information within the
second SQE and sets the ext_cap field to EXT_CAP_PI.  The patch processes
this information to prepare uio_meta descriptor and passes it down using
kiocb->private.

Meta exchange is supported only for direct IO.
Also vectored read/write operations with meta are not supported
currently.

Signed-off-by: Anuj Gupta <[email protected]>
Signed-off-by: Kanchan Joshi <[email protected]>
---
 include/uapi/linux/io_uring.h | 34 ++++++++++++++
 io_uring/io_uring.c           |  8 ++++
 io_uring/rw.c                 | 88 ++++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 | 14 +++++-
 4 files changed, 141 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 56cf30b49ef5..449e7627b1b5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,6 +92,11 @@ struct io_uring_sqe {
 			__u16	addr_len;
 			__u16	__pad3[1];
 		};
+		struct {
+			/* flags indicating additional information being passed */
+			__u16	ext_cap;
+			__u16	__pad4[1];
+		};
 	};
 	union {
 		struct {
@@ -107,6 +112,35 @@ struct io_uring_sqe {
 	};
 };
 
+enum io_uring_sqe_ext_cap_bits {
+	EXT_CAP_PI_BIT,
+	/*
+	 * not a real extended capability; just to make sure that we don't
+	 * overflow
+	 */
+	EXT_CAP_LAST_BIT,
+};
+
+/* extended capability flags */
+#define EXT_CAP_PI	(1U << EXT_CAP_PI_BIT)
+
+/* Second half of SQE128 for IORING_OP_READ/WRITE */
+struct io_uring_sqe_ext {
+	__u64	rsvd0[4];
+	/* if sqe->ext_cap is EXT_CAP_PI, last 32 bytes are for PI */
+	union {
+		__u64	rsvd1[4];
+		struct {
+			__u16	flags;
+			__u16	app_tag;
+			__u32	len;
+			__u64	addr;
+			__u64	seed;
+			__u64	rsvd;
+		} rw_pi;
+	};
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b590e50f09e7..6e582fe93bc4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4165,7 +4165,9 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
 	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
+	BUILD_BUG_SQE_ELEM(44, __u16,  ext_cap);
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
+	BUILD_BUG_SQE_ELEM(46, __u16,  __pad4[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
@@ -4192,6 +4194,12 @@ static int __init io_uring_init(void)
 	/* top 8bits are for internal use */
 	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
 
+	BUILD_BUG_ON(sizeof(struct io_uring_sqe_ext) !=
+		     sizeof(struct io_uring_sqe));
+
+	BUILD_BUG_ON(EXT_CAP_LAST_BIT >
+		     8 * sizeof_field(struct io_uring_sqe, ext_cap));
+
 	io_uring_optable_init();
 
 	/*
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 768a908ca2a8..e60bf0ed4c4f 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,64 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	return 0;
 }
 
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+	io->meta_state.seed = io->meta.seed;
+	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io)
+{
+	io->meta.seed = io->meta_state.seed;
+	iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline const void *io_uring_sqe_ext(const struct io_uring_sqe *sqe)
+{
+	return (sqe + 1);
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   struct io_rw *rw, int ddir)
+{
+	const struct io_uring_sqe_ext *sqe_ext;
+	const struct io_issue_def *def;
+	struct io_async_rw *io;
+	int ret;
+
+	if (!(req->ctx->flags & IORING_SETUP_SQE128))
+		return -EINVAL;
+
+	sqe_ext = io_uring_sqe_ext(sqe);
+	if (READ_ONCE(sqe_ext->rsvd0[0]) || READ_ONCE(sqe_ext->rsvd0[1])
+	    || READ_ONCE(sqe_ext->rsvd0[2]) || READ_ONCE(sqe_ext->rsvd0[3]))
+		return -EINVAL;
+	if (READ_ONCE(sqe_ext->rw_pi.rsvd))
+		return -EINVAL;
+
+	def = &io_issue_defs[req->opcode];
+	if (def->vectored)
+		return -EOPNOTSUPP;
+
+	io = req->async_data;
+	io->meta.flags = READ_ONCE(sqe_ext->rw_pi.flags);
+	io->meta.app_tag = READ_ONCE(sqe_ext->rw_pi.app_tag);
+	io->meta.seed = READ_ONCE(sqe_ext->rw_pi.seed);
+	ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(sqe_ext->rw_pi.addr)),
+			  READ_ONCE(sqe_ext->rw_pi.len), &io->meta.iter);
+	if (unlikely(ret < 0))
+		return ret;
+	rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+	io_meta_save_state(io);
+	return ret;
+}
+
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
+	u16 ext_cap;
 	int ret;
 
 	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +332,23 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		rw->kiocb.ki_ioprio = get_current_ioprio();
 	}
 	rw->kiocb.dio_complete = NULL;
+	rw->kiocb.ki_flags = 0;
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return io_prep_rw_setup(req, ddir, do_import);
+	ret = io_prep_rw_setup(req, ddir, do_import);
+
+	if (unlikely(ret))
+		return ret;
+
+	ext_cap = READ_ONCE(sqe->ext_cap);
+	if (ext_cap) {
+		if (READ_ONCE(sqe->__pad4[0]) || !(ext_cap & EXT_CAP_PI))
+			return -EINVAL;
+		ret = io_prep_rw_pi(req, sqe, rw, ddir);
+	}
+	return ret;
 }
 
 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -410,7 +475,10 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
+	if (rw->kiocb.ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 	iov_iter_restore(&io->iter, &io->iter_state);
 }
 
@@ -795,7 +863,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (!(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(file);
 
-	kiocb->ki_flags = file->f_iocb_flags;
+	kiocb->ki_flags |= file->f_iocb_flags;
 	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
 	if (unlikely(ret))
 		return ret;
@@ -829,6 +897,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_complete = io_complete_rw;
 	}
 
+	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+		struct io_async_rw *io = req->async_data;
+
+		/*
+		 * We have a union of meta fields with wpq used for buffered-io
+		 * in io_async_rw, so fail it here.
+		 */
+		if (!(req->file->f_flags & O_DIRECT))
+			return -EOPNOTSUPP;
+		kiocb->private = &io->meta;
+	}
+
 	return 0;
 }
 
@@ -903,6 +983,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * manually if we need to.
 	 */
 	iov_iter_restore(&io->iter, &io->iter_state);
+	if (kiocb->ki_flags & IOCB_HAS_METADATA)
+		io_meta_restore(io);
 
 	do {
 		/*
@@ -1126,6 +1208,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	} else {
 ret_eagain:
 		iov_iter_restore(&io->iter, &io->iter_state);
+		if (kiocb->ki_flags & IOCB_HAS_METADATA)
+			io_meta_restore(io);
 		if (kiocb->ki_flags & IOCB_WRITE)
 			io_req_end_write(req);
 		return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@
 
 #include <linux/pagemap.h>
 
+struct io_meta_state {
+	u32			seed;
+	struct iov_iter_state	iter_meta;
+};
+
 struct io_async_rw {
 	size_t				bytes_done;
 	struct iov_iter			iter;
@@ -9,7 +14,14 @@ struct io_async_rw {
 	struct iovec			fast_iov;
 	struct iovec			*free_iovec;
 	int				free_iov_nr;
-	struct wait_page_queue		wpq;
+	/* wpq is for buffered io, while meta fields are used with direct io */
+	union {
+		struct wait_page_queue		wpq;
+		struct {
+			struct uio_meta			meta;
+			struct io_meta_state		meta_state;
+		};
+	};
 };
 
 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-- 
2.25.1


  parent reply	other threads:[~2024-11-06 14:29 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20241106122631epcas5p2575c59b7634e0077f8e5c654b5fd5dbb@epcas5p2.samsung.com>
2024-11-06 12:18 ` [PATCH v8 00/10] Read/Write with meta/integrity Anuj Gupta
     [not found]   ` <CGME20241106122656epcas5p2aa5c312dd186b125b7c3f1af199b46d8@epcas5p2.samsung.com>
2024-11-06 12:18     ` [PATCH v8 01/10] block: define set of integrity flags to be inherited by cloned bip Anuj Gupta
     [not found]   ` <CGME20241106122659epcas5p28505a2288f3ac24408aaf138ac053793@epcas5p2.samsung.com>
2024-11-06 12:18     ` [PATCH v8 02/10] block: copy back bounce buffer to user-space correctly in case of split Anuj Gupta
     [not found]   ` <CGME20241106122701epcas5p379d6d2c0f7a758f959e02a363ee6871d@epcas5p3.samsung.com>
2024-11-06 12:18     ` [PATCH v8 03/10] block: modify bio_integrity_map_user to accept iov_iter as argument Anuj Gupta
     [not found]   ` <CGME20241106122704epcas5p37a156fb2738c3b8194e8f81c26a07878@epcas5p3.samsung.com>
2024-11-06 12:18     ` [PATCH v8 04/10] fs, iov_iter: define meta io descriptor Anuj Gupta
     [not found]   ` <CGME20241106122707epcas5p4569e1ddc2ded4de5da6663fb7ffc9464@epcas5p4.samsung.com>
2024-11-06 12:18     ` [PATCH v8 05/10] fs: introduce IOCB_HAS_METADATA for metadata Anuj Gupta
     [not found]   ` <CGME20241106122710epcas5p2b314c865f8333c890dd6f22cf2edbe2f@epcas5p2.samsung.com>
2024-11-06 12:18     ` Anuj Gupta [this message]
2024-11-07  5:55       ` [PATCH v8 06/10] io_uring/rw: add support to send metadata along with read/write Christoph Hellwig
2024-11-07  7:26         ` Anuj gupta
2024-11-07  7:38           ` Christoph Hellwig
2024-11-07 10:40             ` Anuj Gupta
2024-11-07 11:44               ` Christoph Hellwig
2024-11-12  0:54           ` Pavel Begunkov
2024-11-12  6:51             ` Anuj Gupta
     [not found]   ` <CGME20241106122713epcas5p433c4b14f82da79176c3e8bf6835054fe@epcas5p4.samsung.com>
2024-11-06 12:18     ` [PATCH v8 07/10] block: introduce BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags Anuj Gupta
     [not found]   ` <CGME20241106122715epcas5p1ccc25dc0bbfae6db881fecb6bd00d5e0@epcas5p1.samsung.com>
2024-11-06 12:18     ` [PATCH v8 08/10] nvme: add support for passing on the application tag Anuj Gupta
     [not found]   ` <CGME20241106122718epcas5p184094be8de895aa606170da9f4204ae0@epcas5p1.samsung.com>
2024-11-06 12:18     ` [PATCH v8 09/10] scsi: add support for user-meta interface Anuj Gupta
     [not found]   ` <CGME20241106122721epcas5p3849e3582fcb6547be2b1b10b031138d4@epcas5p3.samsung.com>
2024-11-06 12:18     ` [PATCH v8 10/10] block: add support to pass user meta buffer Anuj Gupta

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox