public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH v5 0/3] implement direct IO with integrity
@ 2022-09-20 14:46 Alexander V. Buev
  2022-09-20 14:46 ` [PATCH v5 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Alexander V. Buev @ 2022-09-20 14:46 UTC (permalink / raw)
  To: linux-block
  Cc: io-uring, Jens Axboe, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux,
	Alexander V. Buev

This series of patches makes possible to do direct block IO
with integrity payload using io uring kernel interface.
Userspace app can utilize new READV_PI/WRITEV_PI operation with a new
fields in sqe struct (pi_addr/pi_len) to provide iovec's with
integrity data.

Changes since v4:
 - fixed sparse warnings reported by robot
 - some litle code sync rw.c -> rw_pi.c
 - some includes cleanup

Changes since v3:
 - fixed warnings reported by robot 

Changes since v2:
 - separate code from fast path
 - keep rw_pi struct size <= 64 byte
 - using kiocb->private pointer to pass
   PI data iterator to block direct IO layer   
 - improved bio_integrity_add_iovec function 

Alexander V. Buev (3):
  block: bio-integrity: add PI iovec to bio
  block: io-uring: add READV_PI/WRITEV_PI operations
  block: fops: handle IOCB_USE_PI in direct IO

 block/bio-integrity.c         | 163 +++++++++
 block/fops.c                  |  80 +++++
 include/linux/bio.h           |   8 +
 include/linux/fs.h            |   1 +
 include/uapi/linux/io_uring.h |   6 +
 include/uapi/linux/uio.h      |   3 +-
 io_uring/Makefile             |   3 +-
 io_uring/io_uring.c           |   2 +
 io_uring/opdef.c              |  27 ++
 io_uring/rw.h                 |   4 +
 io_uring/rw_pi.c              | 630 ++++++++++++++++++++++++++++++++++
 io_uring/rw_pi.h              |  34 ++
 12 files changed, 959 insertions(+), 2 deletions(-)
 create mode 100644 io_uring/rw_pi.c
 create mode 100644 io_uring/rw_pi.h

-- 
2.30.2


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v5 1/3] block: bio-integrity: add PI iovec to bio
  2022-09-20 14:46 [PATCH v5 0/3] implement direct IO with integrity Alexander V. Buev
@ 2022-09-20 14:46 ` Alexander V. Buev
  2022-09-27  7:47   ` Christoph Hellwig
  2022-09-20 14:46 ` [PATCH v5 2/3] block: io-uring: add READV_PI/WRITEV_PI operations Alexander V. Buev
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Alexander V. Buev @ 2022-09-20 14:46 UTC (permalink / raw)
  To: linux-block
  Cc: io-uring, Jens Axboe, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux,
	Alexander V. Buev

Added functions to attach user PI iovec pages to bio and release this
pages via bio_integrity_free.

Signed-off-by: Alexander V. Buev <[email protected]>
---
 block/bio-integrity.c | 163 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h   |   8 +++
 2 files changed, 171 insertions(+)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 3f5685c00e36..bd6b74ae2c95 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -10,6 +10,7 @@
 #include <linux/mempool.h>
 #include <linux/export.h>
 #include <linux/bio.h>
+#include <linux/uio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include "blk.h"
@@ -91,6 +92,18 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+void bio_integrity_release_pages(struct bio *bio)
+{
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+	struct bio_vec *bv = bip->bip_vec;
+	unsigned short i;
+
+	for (i = 0; i < bip->bip_vcnt; i++) {
+		put_page(bv->bv_page);
+		bv++;
+	}
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -105,6 +118,10 @@ void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	else {
+		if (bip->bip_flags & BIP_RELEASE_PAGES)
+			bio_integrity_release_pages(bio);
+	}
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -378,6 +395,152 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
 	bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
 }
 
+static inline
+struct page **__bio_integrity_temp_pages(struct bio *bio, unsigned int nr_needed_page)
+{
+	unsigned int nr_avail_page = 0;
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+
+	if (bip->bip_max_vcnt > nr_needed_page) {
+		nr_avail_page = (bip->bip_max_vcnt - nr_needed_page) *
+			sizeof(struct bio_vec)/sizeof(struct page *);
+	}
+
+	if (nr_avail_page >= nr_needed_page)
+		return (struct page **) (bip->bip_vec + nr_needed_page);
+	else {
+		if (bio->bi_max_vecs - bio->bi_vcnt) {
+			nr_avail_page = (bio->bi_max_vecs - bio->bi_vcnt) *
+				sizeof(struct bio_vec)/sizeof(struct page *);
+			if (nr_avail_page >= nr_needed_page)
+				return (struct page **) (bio->bi_io_vec + bio->bi_vcnt);
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * bio_integrity_add_iovec - Add PI io vector
+ * @bio:	bio whose integrity vector to update
+ * @pi_iter:	iov_iter pointed to data added to @bio's integrity
+ *
+ * Description: Pins pages for *pi_iov and appends them to @bio's integrity.
+ */
+int bio_integrity_add_iovec(struct bio *bio, struct iov_iter *pi_iter)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	struct bio_integrity_payload *bip;
+	struct page **pi_page = 0, **bio_page;
+	unsigned int nr_vec_page;
+	int ret;
+	ssize_t size;
+	size_t offset, pg_num, page_count;
+
+	if (unlikely(!(bi && bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE))) {
+		pr_err("Device %d:%d is not integrity capable",
+			MAJOR(bio->bi_bdev->bd_dev), MINOR(bio->bi_bdev->bd_dev));
+		return -EINVAL;
+	}
+
+	nr_vec_page = iov_iter_npages(pi_iter,
+		queue_max_integrity_segments(bdev_get_queue(bio->bi_bdev)));
+	bip = bio_integrity(bio);
+	if (bip) {
+		if (nr_vec_page > (bip->bip_max_vcnt - bip->bip_vcnt))
+			return -ENOMEM;
+	} else {
+		bip = bio_integrity_alloc(bio, GFP_NOIO, nr_vec_page);
+		if (IS_ERR(bip))
+			return PTR_ERR(bip);
+	}
+
+	/* get space for page pointers array */
+	bio_page = __bio_integrity_temp_pages(bio, nr_vec_page);
+
+	if (likely(bio_page))
+		pi_page = bio_page;
+	else {
+		pi_page = kcalloc(nr_vec_page,
+					sizeof(struct pi_page *), GFP_NOIO);
+		if (!pi_page) {
+			ret = -ENOMEM;
+			goto error;
+		}
+	}
+
+	bip->bip_iter.bi_size = pi_iter->count;
+	bip->bio_iter = bio->bi_iter;
+	bip_set_seed(bip, bio->bi_iter.bi_sector);
+
+	if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
+		bip->bip_flags |= BIP_IP_CHECKSUM;
+
+	do {
+		size = iov_iter_get_pages2(pi_iter, pi_page, LONG_MAX,
+						nr_vec_page, &offset);
+		if (unlikely(size <= 0)) {
+			pr_err("Failed to pin integrity buffer for %d:%d\n",
+				MAJOR(bio->bi_bdev->bd_dev),
+				MINOR(bio->bi_bdev->bd_dev));
+			pr_err("Buffer size=%zu pages=%u err=%zi\n",
+				pi_iter->count, nr_vec_page, size);
+			ret = (size) ? size : -EFAULT;
+			goto error;
+		}
+
+		page_count = DIV_ROUND_UP(offset + size, PAGE_SIZE);
+
+		/* fill bio integrity biovecs the given pages */
+		for (pg_num = 0; pg_num < page_count; ++pg_num) {
+			size_t page_len;
+
+			page_len = min_t(size_t, PAGE_SIZE - offset, size);
+			ret = bio_integrity_add_page(bio, pi_page[pg_num],
+							page_len, offset);
+			if (unlikely(ret != page_len)) {
+				while ((1 + pg_num) > 0) {
+					put_page(pi_page[pg_num]);
+					pg_num--;
+				}
+				ret = -ENOMEM;
+				goto error;
+			}
+			size -= page_len;
+			offset = 0;
+			bip->bip_flags |= BIP_RELEASE_PAGES;
+		}
+
+		nr_vec_page -= page_count;
+
+	} while (pi_iter->count && nr_vec_page);
+
+
+	if (pi_iter->count) {
+		pr_err("Failed to pin whole integrity buffer for %d:%d\n",
+			MAJOR(bio->bi_bdev->bd_dev),
+			MINOR(bio->bi_bdev->bd_dev));
+		pr_err("Data of size=%zi not pined\n", pi_iter->count);
+		ret = -EFAULT;
+		goto error;
+	}
+
+	if (pi_page != bio_page)
+		kfree(pi_page);
+
+	return 0;
+
+error:
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+
+	if (pi_page && pi_page != bio_page)
+		kfree(pi_page);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_add_iovec);
+
 /**
  * bio_integrity_trim - Trim integrity vector
  * @bio:	bio whose integrity vector to update
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ca22b06700a9..e7e328425c90 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -317,6 +317,7 @@ enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_RELEASE_PAGES	= 1 << 5, /* release pages after io completion */
 };
 
 /*
@@ -699,6 +700,7 @@ extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, un
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
 extern bool bio_integrity_prep(struct bio *);
 extern void bio_integrity_advance(struct bio *, unsigned int);
+extern int bio_integrity_add_iovec(struct bio *bio, struct iov_iter *iter);
 extern void bio_integrity_trim(struct bio *);
 extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
 extern int bioset_integrity_create(struct bio_set *, int);
@@ -739,6 +741,12 @@ static inline void bio_integrity_advance(struct bio *bio,
 	return;
 }
 
+static inline int bio_integrity_add_iovec(struct bio *bio,
+					struct iov_iter *pi_iter)
+{
+	return 0;
+}
+
 static inline void bio_integrity_trim(struct bio *bio)
 {
 	return;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v5 2/3] block: io-uring: add READV_PI/WRITEV_PI operations
  2022-09-20 14:46 [PATCH v5 0/3] implement direct IO with integrity Alexander V. Buev
  2022-09-20 14:46 ` [PATCH v5 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
@ 2022-09-20 14:46 ` Alexander V. Buev
  2022-09-21 17:59   ` Jens Axboe
  2022-09-20 14:46 ` [PATCH v5 3/3] block: fops: handle IOCB_USE_PI in direct IO Alexander V. Buev
  2022-09-20 20:12 ` [PATCH v5 0/3] implement direct IO with integrity Keith Busch
  3 siblings, 1 reply; 11+ messages in thread
From: Alexander V. Buev @ 2022-09-20 14:46 UTC (permalink / raw)
  To: linux-block
  Cc: io-uring, Jens Axboe, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux,
	Alexander V. Buev

Added new READV_PI/WRITEV_PI operations to io_uring.
Added new pi_addr & pi_len fields to SQE struct.
Added new IOCB_USE_PI flag to kiocb struct.
Use kiocb->private pointer to pass PI data
iterator to low layer.

Signed-off-by: Alexander V. Buev <[email protected]>
---
 include/linux/fs.h            |   1 +
 include/uapi/linux/io_uring.h |   6 +
 include/uapi/linux/uio.h      |   3 +-
 io_uring/Makefile             |   3 +-
 io_uring/io_uring.c           |   2 +
 io_uring/opdef.c              |  27 ++
 io_uring/rw.h                 |   4 +
 io_uring/rw_pi.c              | 630 ++++++++++++++++++++++++++++++++++
 io_uring/rw_pi.h              |  34 ++
 9 files changed, 708 insertions(+), 2 deletions(-)
 create mode 100644 io_uring/rw_pi.c
 create mode 100644 io_uring/rw_pi.h

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eced4cc286e..a28b12a22750 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -337,6 +337,7 @@ enum rw_hint {
 #define IOCB_NOIO		(1 << 20)
 /* can use bio alloc cache */
 #define IOCB_ALLOC_CACHE	(1 << 21)
+#define IOCB_USE_PI		(1 << 22)
 
 struct kiocb {
 	struct file		*ki_filp;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6b83177fd41d..a4158e48cecb 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -80,6 +80,10 @@ struct io_uring_sqe {
 			__u64	addr3;
 			__u64	__pad2[1];
 		};
+		struct {
+			__u64	pi_addr;
+			__u32	pi_len;
+		};
 		/*
 		 * If the ring is initialized with IORING_SETUP_SQE128, then
 		 * this field is used for 80 bytes of arbitrary command data
@@ -206,6 +210,8 @@ enum io_uring_op {
 	IORING_OP_SOCKET,
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
+	IORING_OP_READV_PI,
+	IORING_OP_WRITEV_PI,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 059b1a9147f4..c9eaaa6cdb0f 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -23,9 +23,10 @@ struct iovec
 /*
  *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
  */
- 
+
 #define UIO_FASTIOV	8
 #define UIO_MAXIOV	1024
+#define UIO_FASTIOV_PI	1
 
 
 #endif /* _UAPI__LINUX_UIO_H */
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 8cc8e5387a75..8c01546c2bcf 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					openclose.o uring_cmd.o epoll.o \
 					statx.o net.o msg_ring.o timeout.o \
 					sqpoll.o fdinfo.o tctx.o poll.o \
-					cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+					cancel.o kbuf.o rsrc.o rw.o opdef.o \
+					notif.o rw_pi.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b9640ad5069f..b2f451e18646 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3927,7 +3927,9 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
+	BUILD_BUG_SQE_ELEM(48, __u64,  pi_addr);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+	BUILD_BUG_SQE_ELEM(56, __u32,  pi_len);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index c61494e0a602..da2b12a44995 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,7 @@
 #include "poll.h"
 #include "cancel.h"
 #include "rw.h"
+#include "rw_pi.h"
 
 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
 {
@@ -488,6 +489,32 @@ const struct io_op_def io_op_defs[] = {
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
+	[IORING_OP_READV_PI] = {
+		.needs_file		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.async_size		= sizeof(struct io_async_rw_pi),
+		.name			= "READV_PI",
+		.prep			= io_prep_rw_pi,
+		.issue			= io_readv_pi,
+		.prep_async		= io_readv_pi_prep_async,
+		.cleanup		= io_readv_writev_cleanup,
+	},
+	[IORING_OP_WRITEV_PI] = {
+		.needs_file		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.async_size		= sizeof(struct io_async_rw_pi),
+		.name			= "WRITEV_PI",
+		.prep			= io_prep_rw_pi,
+		.issue			= io_writev_pi,
+		.prep_async		= io_writev_pi_prep_async,
+		.cleanup		= io_readv_writev_cleanup,
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 0204c3fcafa5..c00ece398540 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_RW_H
+#define IOU_RW_H
 
 #include <linux/pagemap.h>
 
@@ -21,3 +23,5 @@ int io_readv_prep_async(struct io_kiocb *req);
 int io_write(struct io_kiocb *req, unsigned int issue_flags);
 int io_writev_prep_async(struct io_kiocb *req);
 void io_readv_writev_cleanup(struct io_kiocb *req);
+
+#endif
diff --git a/io_uring/rw_pi.c b/io_uring/rw_pi.c
new file mode 100644
index 000000000000..ecee133d709c
--- /dev/null
+++ b/io_uring/rw_pi.c
@@ -0,0 +1,630 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/fsnotify.h>
+#include <linux/poll.h>
+#include <linux/nospec.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "opdef.h"
+#include "rw_pi.h"
+
+#define io_kiocb_to_kiocb(req, type) \
+				(&((type *)io_kiocb_to_cmd(req, type))->kiocb)
+#define DATA	(0)
+#define PI	(1)
+
+#define u64_to_ptr(x)	(		\
+{					\
+	typecheck(u64, (x));		\
+	(void *)(uintptr_t)(x);		\
+})
+
+struct io_rw_pi {
+	struct kiocb			kiocb;
+	u64				addr;
+	u32				nr_segs;
+	u32				nr_pi_segs;
+};
+
+static inline
+void io_rw_pi_state_iter_restore(struct io_rw_state *data, struct __io_rw_pi_state *pi)
+{
+	iov_iter_restore(&data->iter, &data->iter_state);
+	iov_iter_restore(&pi->iter, &pi->iter_state);
+}
+
+static inline
+void io_rw_pi_state_iter_save(struct io_rw_state *data, struct __io_rw_pi_state *pi)
+{
+	iov_iter_save_state(&data->iter, &data->iter_state);
+	iov_iter_save_state(&pi->iter, &pi->iter_state);
+}
+
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
+{
+	return req->flags & REQ_F_SUPPORT_NOWAIT;
+}
+
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+	switch (ret) {
+	case -EIOCBQUEUED:
+		break;
+	case -ERESTARTSYS:
+	case -ERESTARTNOINTR:
+	case -ERESTARTNOHAND:
+	case -ERESTART_RESTARTBLOCK:
+		/*
+		 * We can't just restart the syscall, since previously
+		 * submitted sqes may already be in progress. Just fail this
+		 * IO with EINTR.
+		 */
+		ret = -EINTR;
+		fallthrough;
+	default:
+		kiocb->ki_complete(kiocb, ret);
+	}
+}
+
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+
+	if (rw->kiocb.ki_pos != -1)
+		return &rw->kiocb.ki_pos;
+
+	if (!(req->file->f_mode & FMODE_STREAM)) {
+		req->flags |= REQ_F_CUR_POS;
+		rw->kiocb.ki_pos = req->file->f_pos;
+		return &rw->kiocb.ki_pos;
+	}
+
+	rw->kiocb.ki_pos = 0;
+	return NULL;
+}
+
+static void io_req_task_queue_reissue(struct io_kiocb *req)
+{
+	req->io_task_work.func = io_queue_iowq;
+	io_req_task_work_add(req);
+}
+
+static bool io_resubmit_prep(struct io_kiocb *req)
+{
+	struct io_async_rw_pi *arw = req->async_data;
+
+	if (!req_has_async_data(req))
+		return !io_req_prep_async(req);
+	io_rw_pi_state_iter_restore(&arw->data.s, &arw->pi.s);
+	return true;
+}
+
+static bool io_rw_should_reissue(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
+	    !(ctx->flags & IORING_SETUP_IOPOLL)))
+		return false;
+	/*
+	 * If ref is dying, we might be running poll reap from the exit work.
+	 * Don't attempt to reissue from that path, just let it fail with
+	 * -EAGAIN.
+	 */
+	if (percpu_ref_is_dying(&ctx->refs))
+		return false;
+	/*
+	 * Play it safe and assume not safe to re-import and reissue if we're
+	 * not in the original thread group (or in task context).
+	 */
+	if (!same_thread_group(req->task, current) || !in_task())
+		return false;
+	return true;
+}
+
+static bool __io_complete_rw_common(struct io_kiocb *req, long res)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+
+	if (rw->kiocb.ki_flags & IOCB_WRITE)
+		fsnotify_modify(req->file);
+	else
+		fsnotify_access(req->file);
+
+	if (unlikely(res != req->cqe.res)) {
+		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
+		    io_rw_should_reissue(req)) {
+			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
+			return true;
+		}
+		req_set_fail(req);
+		req->cqe.res = res;
+	}
+	return false;
+}
+
+static inline
+unsigned int io_fixup_rw_res(struct io_kiocb *req, unsigned int res)
+{
+	struct io_async_rw_pi *arw = req->async_data;
+
+	/* add previously done IO, if any */
+	if (req_has_async_data(req) && arw->data.bytes_done > 0) {
+		if (res < 0)
+			res = arw->data.bytes_done;
+		else
+			res += arw->data.bytes_done;
+	}
+	return res;
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res)
+{
+	struct io_rw_pi *rw = container_of(kiocb, struct io_rw_pi, kiocb);
+	struct io_kiocb *req = cmd_to_io_kiocb(rw);
+
+	if (__io_complete_rw_common(req, res))
+		return;
+	io_req_set_res(req, io_fixup_rw_res(req, res), 0);
+	req->io_task_work.func = io_req_task_complete;
+	io_req_task_work_add(req);
+}
+
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
+{
+	struct io_rw_pi *rw = container_of(kiocb, struct io_rw_pi, kiocb);
+	struct io_kiocb *req = cmd_to_io_kiocb(rw);
+
+	if (unlikely(res != req->cqe.res)) {
+		if (res == -EAGAIN && io_rw_should_reissue(req)) {
+			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
+			return;
+		}
+		req->cqe.res = res;
+	}
+
+	/* order with io_iopoll_complete() checking ->iopoll_completed */
+	smp_store_release(&req->iopoll_completed, 1);
+}
+
+static int kiocb_done(struct io_kiocb *req, ssize_t ret,
+		       unsigned int issue_flags)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+	unsigned int final_ret = io_fixup_rw_res(req, ret);
+
+	if (req->flags & REQ_F_CUR_POS)
+		req->file->f_pos = rw->kiocb.ki_pos;
+	if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
+		if (!__io_complete_rw_common(req, ret)) {
+			io_req_set_res(req, final_ret, 0);
+			return IOU_OK;
+		}
+	} else {
+		io_rw_done(&rw->kiocb, ret);
+	}
+
+	if (req->flags & REQ_F_REISSUE) {
+		req->flags &= ~REQ_F_REISSUE;
+		if (io_resubmit_prep(req))
+			io_req_task_queue_reissue(req);
+		else
+			io_req_task_queue_fail(req, ret);
+	}
+	return IOU_ISSUE_SKIP_COMPLETE;
+}
+
+int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+	struct kiocb *kiocb = &rw->kiocb;
+	unsigned int ioprio;
+	int ret;
+
+	kiocb->ki_flags = 0;
+	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+	if (unlikely(ret))
+		return ret;
+
+	kiocb->ki_pos = READ_ONCE(sqe->off);
+
+	ioprio = READ_ONCE(sqe->ioprio);
+	if (ioprio) {
+		ret = ioprio_check_cap(ioprio);
+		if (ret)
+			return ret;
+
+		kiocb->ki_ioprio = ioprio;
+	} else {
+		kiocb->ki_ioprio = get_current_ioprio();
+	}
+
+	req->imu = NULL;
+
+	/* save data iovec pointer & len */
+	rw->addr = (uintptr_t)READ_ONCE(sqe->addr);
+	rw->nr_segs = READ_ONCE(sqe->len);
+
+	/* save pi iovec pointer & len */
+	rw->kiocb.private = u64_to_ptr(READ_ONCE(sqe->pi_addr));
+	rw->nr_pi_segs = READ_ONCE(sqe->pi_len);
+
+	kiocb->ki_flags |= IOCB_USE_PI;
+
+	return 0;
+}
+
+
+static inline int
+io_import_iovecs_pi(int io_dir, struct io_kiocb *req, struct iovec **iovec,
+			struct io_rw_state *s_data, struct __io_rw_pi_state *s_pi)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+	struct iovec __user *uvec;
+	ssize_t ret;
+
+	/* data */
+	uvec = (struct iovec __user *)u64_to_user_ptr(rw->addr);
+	iovec[DATA] = s_data->fast_iov;
+	ret = __import_iovec(io_dir, uvec, rw->nr_segs,
+				UIO_FASTIOV, iovec + DATA,
+				&s_data->iter, req->ctx->compat);
+
+	if (unlikely(ret <= 0))
+		return (ret) ? ret : -EINVAL;
+	/* pi */
+	uvec = (struct iovec __user *)rw->kiocb.private;
+	iovec[PI] = s_pi->fast_iov;
+	ret = __import_iovec(io_dir, uvec, rw->nr_pi_segs,
+				UIO_FASTIOV_PI, iovec + PI,
+				&s_pi->iter, req->ctx->compat);
+	if (unlikely(ret <= 0)) {
+		if (iovec[DATA])
+			kfree(iovec[DATA]);
+		return (ret) ? ret : -EINVAL;
+	}
+
+	/* save states */
+	io_rw_pi_state_iter_save(s_data, s_pi);
+
+	return 0;
+}
+
+static inline void
+io_setup_async_state(struct io_rw_state *async_s, const struct io_rw_state *s)
+{
+	unsigned int iov_off = 0;
+
+	async_s->iter.iov = async_s->fast_iov;
+	if (s->iter.iov != s->fast_iov) {
+		iov_off = s->iter.iov - s->fast_iov;
+		async_s->iter.iov += iov_off;
+	}
+	if (async_s->fast_iov != s->fast_iov) {
+		memcpy(async_s->fast_iov + iov_off, s->fast_iov + iov_off,
+			       sizeof(struct iovec) * s->iter.nr_segs);
+	}
+}
+
+static int
+io_setup_async_rw_pi(struct io_kiocb *req, struct iovec * const *iovec,
+			struct io_rw_state *s_data,
+			struct __io_rw_pi_state *s_pi)
+{
+	struct io_async_rw_pi *arw;
+
+	if (req_has_async_data(req))
+		return 0;
+
+	if (io_alloc_async_data(req))
+		return -ENOMEM;
+
+	arw = req->async_data;
+
+	/* data */
+	arw->data.s.iter = s_data->iter;
+	arw->data.free_iovec = iovec[DATA];
+	arw->data.bytes_done = 0;
+
+	if (iovec[DATA])
+		req->flags |= REQ_F_NEED_CLEANUP;
+	else
+		io_setup_async_state(&arw->data.s, s_data);
+
+	/* pi */
+	arw->pi.s.iter = s_pi->iter;
+	arw->pi.free_iovec = iovec[PI];
+
+	if (iovec[PI])
+		req->flags |= REQ_F_NEED_CLEANUP;
+	else {
+		io_setup_async_state((struct io_rw_state *)&arw->pi.s,
+					(const struct io_rw_state *)s_pi);
+	}
+
+	/* save states */
+	io_rw_pi_state_iter_save(&arw->data.s, &arw->pi.s);
+
+	return 0;
+}
+
+static inline int io_rw_pi_prep_async(struct io_kiocb *req, int io_dir)
+{
+	int ret = 0;
+	struct io_async_rw_pi *arw = req->async_data;
+	struct iovec *iovec[2];
+
+	ret = io_import_iovecs_pi(io_dir, req, iovec,
+					&arw->data.s, &arw->pi.s);
+	if (unlikely(ret < 0))
+		return ret;
+
+	arw->data.bytes_done = 0;
+	arw->data.free_iovec = iovec[DATA];
+	arw->pi.free_iovec = iovec[PI];
+
+	if (iovec[DATA] || iovec[PI])
+		req->flags |= REQ_F_NEED_CLEANUP;
+
+	return 0;
+}
+
+
+int io_readv_pi_prep_async(struct io_kiocb *req)
+{
+	return io_rw_pi_prep_async(req, READ);
+}
+
+int io_writev_pi_prep_async(struct io_kiocb *req)
+{
+	return io_rw_pi_prep_async(req, WRITE);
+}
+
+static int io_rw_pi_init_file(struct io_kiocb *req, fmode_t mode)
+{
+	struct kiocb *kiocb = io_kiocb_to_kiocb(req, struct io_rw_pi);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct file *file = req->file;
+	int flags;
+
+	if (unlikely(!file || !(file->f_mode & mode)))
+		return -EBADF;
+
+	if (unlikely(!S_ISBLK(file_inode(req->file)->i_mode)))
+		return -EINVAL;
+
+	if (unlikely(!(file->f_flags & O_DIRECT)))
+		return -EINVAL;
+
+	if (!io_req_ffs_set(req))
+		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+	flags = kiocb->ki_flags;
+	kiocb->ki_flags = iocb_flags(file);
+	kiocb->ki_flags |= flags;
+
+	/*
+	 * If the file is marked O_NONBLOCK, still allow retry for it if it
+	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
+	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
+	 */
+	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
+		req->flags |= REQ_F_NOWAIT;
+
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
+			return -EOPNOTSUPP;
+
+		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
+		kiocb->ki_complete = io_complete_rw_iopoll;
+		req->iopoll_completed = 0;
+	} else {
+		if (kiocb->ki_flags & IOCB_HIPRI)
+			return -EINVAL;
+		kiocb->ki_complete = io_complete_rw;
+	}
+
+	return 0;
+}
+
+void io_readv_writev_pi_cleanup(struct io_kiocb *req)
+{
+	struct io_async_rw_pi *arw = req->async_data;
+
+	kfree(arw->data.free_iovec);
+	kfree(arw->pi.free_iovec);
+}
+
+int io_readv_pi(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_rw_pi_state s;
+	struct io_rw_state *s_data;
+	struct __io_rw_pi_state *s_pi;
+	struct iovec *iovec[2];
+	struct kiocb *kiocb = io_kiocb_to_kiocb(req, struct io_rw_pi);
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	ssize_t ret;
+	loff_t *ppos;
+
+	if (!req_has_async_data(req)) {
+		s_data = &s.data;
+		s_pi = &s.pi;
+		ret = io_import_iovecs_pi(READ, req, iovec, s_data, s_pi);
+		if (unlikely(ret < 0))
+			return ret;
+	} else {
+		struct io_async_rw_pi *arw = req->async_data;
+
+		iovec[DATA] = iovec[PI] = NULL;
+		s_data = &arw->data.s;
+		s_pi = &arw->pi.s;
+		io_rw_pi_state_iter_restore(s_data, s_pi);
+	}
+	kiocb->private = &s_pi->iter;
+
+	ret = io_rw_pi_init_file(req, FMODE_READ);
+	if (unlikely(ret))
+		goto out_free;
+
+	req->cqe.res = iov_iter_count(&s_data->iter);
+	if (force_nonblock) {
+		/* If the file doesn't support async, just async punt */
+		if (unlikely(!io_file_supports_nowait(req))) {
+			ret = io_setup_async_rw_pi(req, iovec, s_data, s_pi);
+			return ret ?: -EAGAIN;
+		}
+		kiocb->ki_flags |= IOCB_NOWAIT;
+	} else {
+		/* Ensure we clear previously set non-block flag */
+		kiocb->ki_flags &= ~IOCB_NOWAIT;
+	}
+
+	ppos = io_kiocb_update_pos(req);
+
+	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
+	if (unlikely(ret))
+		goto out_free;
+
+	if (likely(req->file->f_op->read_iter))
+		ret = call_read_iter(req->file, kiocb, &s_data->iter);
+	else
+		ret = -EINVAL;
+
+	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
+		req->flags &= ~REQ_F_REISSUE;
+
+		/* IOPOLL retry should happen for io-wq threads */
+		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
+			goto done;
+		/* no retry on NONBLOCK nor RWF_NOWAIT */
+		if (req->flags & REQ_F_NOWAIT)
+			goto done;
+		ret = 0;
+	} else if (ret == -EIOCBQUEUED) {
+		ret = IOU_ISSUE_SKIP_COMPLETE;
+		goto out_free;
+	}
+
+done:
+	/* it's faster to check here then delegate to kfree */
+	if (iovec[DATA])
+		kfree(iovec[DATA]);
+	if (iovec[PI])
+		kfree(iovec[PI]);
+	return kiocb_done(req, ret, issue_flags);
+out_free:
+	if (iovec[DATA])
+		kfree(iovec[DATA]);
+	if (iovec[PI])
+		kfree(iovec[PI]);
+	return ret;
+}
+
+int io_writev_pi(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_rw_pi_state s;
+	struct io_rw_state *s_data;
+	struct __io_rw_pi_state *s_pi;
+	struct iovec *iovec[2];
+	struct kiocb *kiocb = io_kiocb_to_kiocb(req, struct io_rw_pi);
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	ssize_t ret, ret2;
+	loff_t *ppos;
+
+	if (!req_has_async_data(req)) {
+		s_data = &s.data;
+		s_pi = &s.pi;
+		ret = io_import_iovecs_pi(WRITE, req, iovec, s_data, s_pi);
+		if (unlikely(ret < 0))
+			return ret;
+	} else {
+		struct io_async_rw_pi *arw = req->async_data;
+
+		iovec[DATA] = iovec[PI] = 0;
+		s_data = &arw->data.s;
+		s_pi = &arw->pi.s;
+		io_rw_pi_state_iter_restore(s_data, s_pi);
+	}
+	kiocb->private = &s_pi->iter;
+
+	ret = io_rw_pi_init_file(req, FMODE_WRITE);
+	if (unlikely(ret))
+		goto out_free;
+
+	req->cqe.res = iov_iter_count(&s_data->iter);
+
+	if (force_nonblock) {
+		/* If the file doesn't support async, just async punt */
+		if (unlikely(!io_file_supports_nowait(req)))
+			goto copy_iov;
+
+		kiocb->ki_flags |= IOCB_NOWAIT;
+	} else {
+		/* Ensure we clear previously set non-block flag */
+		kiocb->ki_flags &= ~IOCB_NOWAIT;
+	}
+
+	ppos = io_kiocb_update_pos(req);
+
+	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
+	if (unlikely(ret))
+		goto out_free;
+
+	kiocb->ki_flags |= IOCB_WRITE;
+
+	if (likely(req->file->f_op->write_iter))
+		ret2 = call_write_iter(req->file, kiocb, &s_data->iter);
+	else
+		ret2 = -EINVAL;
+
+	if (req->flags & REQ_F_REISSUE) {
+		req->flags &= ~REQ_F_REISSUE;
+		ret2 = -EAGAIN;
+	}
+
+	/*
+	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
+	 * retry them without IOCB_NOWAIT.
+	 */
+	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
+		ret2 = -EAGAIN;
+	/* no retry on NONBLOCK nor RWF_NOWAIT */
+	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
+		goto done;
+	if (!force_nonblock || ret2 != -EAGAIN) {
+		if (ret2 == -EIOCBQUEUED) {
+			ret = IOU_ISSUE_SKIP_COMPLETE;
+			goto out_free;
+		}
+		/* IOPOLL retry should happen for io-wq threads */
+		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
+			goto copy_iov;
+
+done:
+		ret = kiocb_done(req, ret2, issue_flags);
+	} else {
+copy_iov:
+		io_rw_pi_state_iter_restore(s_data, s_pi);
+		ret = io_setup_async_rw_pi(req, iovec, s_data, s_pi);
+		return ret ?: -EAGAIN;
+	}
+out_free:
+	/* it's reportedly faster than delegating the null check to kfree() */
+	if (iovec[DATA])
+		kfree(iovec[DATA]);
+	if (iovec[PI])
+		kfree(iovec[PI]);
+	return ret;
+}
+
diff --git a/io_uring/rw_pi.h b/io_uring/rw_pi.h
new file mode 100644
index 000000000000..f635da982484
--- /dev/null
+++ b/io_uring/rw_pi.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_RW_PI_H
+#define IOU_RW_PI_H
+
+#include "rw.h"
+
+struct __io_rw_pi_state {
+	struct iov_iter			iter;
+	struct iov_iter_state		iter_state;
+	struct iovec			fast_iov[UIO_FASTIOV_PI];
+};
+
+struct io_rw_pi_state {
+	struct io_rw_state		data;
+	struct __io_rw_pi_state		pi;
+};
+
+struct __io_async_rw_pi {
+	const struct iovec		*free_iovec;
+	struct __io_rw_pi_state		s;
+};
+
+struct io_async_rw_pi {
+	struct io_async_rw		data;
+	struct __io_async_rw_pi		pi;
+};
+
+int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_readv_pi(struct io_kiocb *req, unsigned int issue_flags);
+int io_readv_pi_prep_async(struct io_kiocb *req);
+int io_writev_pi(struct io_kiocb *req, unsigned int issue_flags);
+int io_writev_pi_prep_async(struct io_kiocb *req);
+void io_readv_writev_pi_cleanup(struct io_kiocb *req);
+#endif
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v5 3/3] block: fops: handle IOCB_USE_PI in direct IO
  2022-09-20 14:46 [PATCH v5 0/3] implement direct IO with integrity Alexander V. Buev
  2022-09-20 14:46 ` [PATCH v5 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
  2022-09-20 14:46 ` [PATCH v5 2/3] block: io-uring: add READV_PI/WRITEV_PI operations Alexander V. Buev
@ 2022-09-20 14:46 ` Alexander V. Buev
  2022-09-20 20:12 ` [PATCH v5 0/3] implement direct IO with integrity Keith Busch
  3 siblings, 0 replies; 11+ messages in thread
From: Alexander V. Buev @ 2022-09-20 14:46 UTC (permalink / raw)
  To: linux-block
  Cc: io-uring, Jens Axboe, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux,
	Alexander V. Buev

Check that the size of PI data correspond to device integrity profile
and data size.
Add PI data to device BIO.

Signed-off-by: Alexander V. Buev <[email protected]>
---
 block/fops.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/block/fops.c b/block/fops.c
index b90742595317..d89fa7d99635 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -16,6 +16,7 @@
 #include <linux/suspend.h>
 #include <linux/fs.h>
 #include <linux/module.h>
+#include <linux/blk-integrity.h>
 #include "blk.h"
 
 static inline struct inode *bdev_file_inode(struct file *file)
@@ -51,6 +52,19 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
 
 #define DIO_INLINE_BIO_VECS 4
 
+static int __bio_integrity_add_iovec(struct bio *bio, struct iov_iter *pi_iter)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	unsigned int pi_len = bio_integrity_bytes(bi, bio->bi_iter.bi_size >> SECTOR_SHIFT);
+	size_t iter_count = pi_iter->count-pi_len;
+	int ret;
+
+	iov_iter_truncate(pi_iter, pi_len);
+	ret = bio_integrity_add_iovec(bio, pi_iter);
+	iov_iter_reexpand(pi_iter, iter_count);
+	return ret;
+}
+
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 		struct iov_iter *iter, unsigned int nr_pages)
 {
@@ -94,6 +108,15 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		bio.bi_opf |= REQ_NOWAIT;
 
+	if (iocb->ki_flags & IOCB_USE_PI) {
+		ret = __bio_integrity_add_iovec(&bio, (struct iov_iter *)iocb->private);
+		WRITE_ONCE(iocb->private, NULL);
+		if (ret) {
+			bio_release_pages(&bio, should_dirty);
+			goto out;
+		}
+	}
+
 	submit_bio_wait(&bio);
 
 	bio_release_pages(&bio, should_dirty);
@@ -178,6 +201,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
+	struct iov_iter *pi_iter = 0;
 
 	if (blkdev_dio_unaligned(bdev, pos, iter))
 		return -EINVAL;
@@ -235,6 +259,19 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		pos += bio->bi_iter.bi_size;
 
 		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
+
+		if (iocb->ki_flags & IOCB_USE_PI) {
+			if (!pi_iter)
+				pi_iter = (struct iov_iter *)iocb->private;
+			ret = __bio_integrity_add_iovec(bio, pi_iter);
+			WRITE_ONCE(iocb->private, NULL);
+			if (unlikely(ret)) {
+				bio->bi_status = BLK_STS_IOERR;
+				bio_endio(bio);
+				break;
+			}
+		}
+
 		if (!nr_pages) {
 			submit_bio(bio);
 			break;
@@ -343,6 +380,16 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		task_io_account_write(bio->bi_iter.bi_size);
 	}
 
+	if (iocb->ki_flags & IOCB_USE_PI) {
+		ret = __bio_integrity_add_iovec(bio, (struct iov_iter *)iocb->private);
+		WRITE_ONCE(iocb->private, NULL);
+		if (ret) {
+			bio->bi_status = BLK_STS_IOERR;
+			bio_endio(bio);
+			return -EIOCBQUEUED;
+		}
+	}
+
 	if (iocb->ki_flags & IOCB_HIPRI) {
 		bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
 		submit_bio(bio);
@@ -355,6 +402,31 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	return -EIOCBQUEUED;
 }
 
+static inline int
+blkdev_check_pi(struct block_device *bdev, size_t data_size, size_t pi_size)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bdev);
+	unsigned int intervals;
+
+	if (unlikely(!(bi && bi->tuple_size &&
+			bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE))) {
+		pr_err("Device %d:%d is not integrity capable",
+			MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+		return -EINVAL;
+	}
+
+	intervals = bio_integrity_intervals(bi, data_size >> SECTOR_SHIFT);
+	if (unlikely(intervals * bi->tuple_size > pi_size)) {
+		pr_err("Device %d:%d integrity & data size mismatch",
+			MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+		pr_err("data=%zu integrity=%zu intervals=%u tuple=%u",
+			data_size, pi_size,
+			intervals, bi->tuple_size);
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	unsigned int nr_pages;
@@ -362,6 +434,14 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (!iov_iter_count(iter))
 		return 0;
 
+	if (iocb->ki_flags & IOCB_USE_PI) {
+		struct block_device *bdev = iocb->ki_filp->private_data;
+		struct iov_iter *pi_iter = iocb->private;
+
+		if (blkdev_check_pi(bdev, iter->count, pi_iter->count))
+			return -EINVAL;
+	}
+
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
 	if (likely(nr_pages <= BIO_MAX_VECS)) {
 		if (is_sync_kiocb(iocb))
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v5 0/3] implement direct IO with integrity
  2022-09-20 14:46 [PATCH v5 0/3] implement direct IO with integrity Alexander V. Buev
                   ` (2 preceding siblings ...)
  2022-09-20 14:46 ` [PATCH v5 3/3] block: fops: handle IOCB_USE_PI in direct IO Alexander V. Buev
@ 2022-09-20 20:12 ` Keith Busch
  2022-09-21  9:26   ` Alexander V. Buev
  3 siblings, 1 reply; 11+ messages in thread
From: Keith Busch @ 2022-09-20 20:12 UTC (permalink / raw)
  To: Alexander V. Buev
  Cc: linux-block, io-uring, Jens Axboe, Christoph Hellwig,
	Martin K . Petersen, Pavel Begunkov, Chaitanya Kulkarni,
	Mikhail Malygin, linux

On Tue, Sep 20, 2022 at 05:46:15PM +0300, Alexander V. Buev wrote:
> This series of patches makes possible to do direct block IO
> with integrity payload using io uring kernel interface.
> Userspace app can utilize new READV_PI/WRITEV_PI operation with a new
> fields in sqe struct (pi_addr/pi_len) to provide iovec's with
> integrity data.

Is this really intended to be used exclusively for PI? Once you give use space
access to extended metadata regions, they can use it for whatever the user
wants, which may not be related to protection information formats. Perhaps a
more generic suffix than "_PI" may be appropriate like _EXT or _META?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v5 0/3] implement direct IO with integrity
  2022-09-20 20:12 ` [PATCH v5 0/3] implement direct IO with integrity Keith Busch
@ 2022-09-21  9:26   ` Alexander V. Buev
  2022-09-22 14:09     ` Jens Axboe
  0 siblings, 1 reply; 11+ messages in thread
From: Alexander V. Buev @ 2022-09-21  9:26 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-block, io-uring, Jens Axboe, Christoph Hellwig,
	Martin K . Petersen, Pavel Begunkov, Chaitanya Kulkarni,
	Mikhail Malygin, linux

> «Внимание! Данное письмо от внешнего адресата!»
> 
> On Tue, Sep 20, 2022 at 05:46:15PM +0300, Alexander V. Buev wrote:
> > This series of patches makes possible to do direct block IO
> > with integrity payload using io uring kernel interface.
> > Userspace app can utilize new READV_PI/WRITEV_PI operation with a new
> > fields in sqe struct (pi_addr/pi_len) to provide iovec's with
> > integrity data.
> 
> Is this really intended to be used exclusively for PI? Once you give use space
> access to extended metadata regions, they can use it for whatever the user
> wants, which may not be related to protection information formats. Perhaps a
> more generic suffix than "_PI" may be appropriate like _EXT or _META?

Currently we use this code for transfer block IO with meta information 
from user space to special block device driver. This meta information includes PI and some other
information that helps driver to process IO with some optimization, 
special option and etc. In the near feature we can extend this info die to increased
requirements for our product.

Also we can use this code for transfer IO with PI information from user space
to supported block devices such as nvme & scsi.

And you are right. Just for me "_meta" is more appropriate and abstract suffix for this,
but:

 1. "PI" is shortly
 2. "PI" and "integrity" is widely used in block layer code and I decided that
    if it's called PI - everyone understands what exactly it is about.
 3. User can read/write general info only in case of using special block layer driver. 

Anyway I'm ready to rename this things.

May be it's enough to rename only userspace visible part?
(sqe struct members & op codes)


-- 
Alexander V. Buev

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v5 2/3] block: io-uring: add READV_PI/WRITEV_PI operations
  2022-09-20 14:46 ` [PATCH v5 2/3] block: io-uring: add READV_PI/WRITEV_PI operations Alexander V. Buev
@ 2022-09-21 17:59   ` Jens Axboe
  2022-09-22 12:48     ` Alexander V. Buev
  0 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2022-09-21 17:59 UTC (permalink / raw)
  To: Alexander V. Buev, linux-block
  Cc: io-uring, Christoph Hellwig, Martin K . Petersen, Pavel Begunkov,
	Chaitanya Kulkarni, Mikhail Malygin, linux

On 9/20/22 8:46 AM, Alexander V. Buev wrote:
> Added new READV_PI/WRITEV_PI operations to io_uring.
> Added new pi_addr & pi_len fields to SQE struct.
> Added new IOCB_USE_PI flag to kiocb struct.
> Use kiocb->private pointer to pass PI data
> iterator to low layer.

Minor nit - please format commit message lines to 72-74 chars.

In general, I think this feature is useful. I do echo Keith's response
that it should probably be named a bit differently, as PI is just one
use case of this.

But for this patch in particular, not a huge fan of the rote copying of
rw.c into a new file. Now we have to patch two different spots whenever
a bug is found in there, that's not very maintainable. I do appreciate
the fact that this keeps the PI work out of the fast path for
read/write, but I do think this warrants a bit of refactoring work first
to ensure that there are helpers that can be shared between rw and
rw_pi. That definitely needs to be solved before this can be considered
for inclusion.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v5 2/3] block: io-uring: add READV_PI/WRITEV_PI operations
  2022-09-21 17:59   ` Jens Axboe
@ 2022-09-22 12:48     ` Alexander V. Buev
  2022-09-22 14:08       ` Jens Axboe
  0 siblings, 1 reply; 11+ messages in thread
From: Alexander V. Buev @ 2022-09-22 12:48 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, io-uring, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux

> In general, I think this feature is useful. I do echo Keith's response
> that it should probably be named a bit differently, as PI is just one
> use case of this.
Accepted. 
In the next version, this suffix "pi" will be renamed to "meta"
(meta_addr, meta_len, READV_META, WRITEV_META and etc...)


> But for this patch in particular, not a huge fan of the rote copying of
> rw.c into a new file. Now we have to patch two different spots whenever
> a bug is found in there, that's not very maintainable. I do appreciate
> the fact that this keeps the PI work out of the fast path for
> read/write, but I do think this warrants a bit of refactoring work first
> to ensure that there are helpers that can be shared between rw and
> rw_pi. That definitely needs to be solved before this can be considered
> for inclusion.
I think it would be better to move some of the shared code to another file. 
For example "rw_common.[ch]". What do you think about?
As an alternative I can leave such code in "rw.[ch]" file as is.

-- 
Alexander V. Buev

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v5 2/3] block: io-uring: add READV_PI/WRITEV_PI operations
  2022-09-22 12:48     ` Alexander V. Buev
@ 2022-09-22 14:08       ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2022-09-22 14:08 UTC (permalink / raw)
  To: Alexander V. Buev, linux-block, io-uring, Christoph Hellwig,
	Martin K . Petersen, Pavel Begunkov, Chaitanya Kulkarni,
	Mikhail Malygin, linux

>> But for this patch in particular, not a huge fan of the rote copying of
>> rw.c into a new file. Now we have to patch two different spots whenever
>> a bug is found in there, that's not very maintainable. I do appreciate
>> the fact that this keeps the PI work out of the fast path for
>> read/write, but I do think this warrants a bit of refactoring work first
>> to ensure that there are helpers that can be shared between rw and
>> rw_pi. That definitely needs to be solved before this can be considered
>> for inclusion.
> I think it would be better to move some of the shared code to another file. 
> For example "rw_common.[ch]". What do you think about?
> As an alternative I can leave such code in "rw.[ch]" file as is.

That's basically what I'm suggesting, at least that would be one way to
do it. And the best one imho. So yes, let's go ahead and do that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v5 0/3] implement direct IO with integrity
  2022-09-21  9:26   ` Alexander V. Buev
@ 2022-09-22 14:09     ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2022-09-22 14:09 UTC (permalink / raw)
  To: Alexander V. Buev, Keith Busch, linux-block, io-uring,
	Christoph Hellwig, Martin K . Petersen, Pavel Begunkov,
	Chaitanya Kulkarni, Mikhail Malygin, linux

On 9/21/22 3:26 AM, Alexander V. Buev wrote:
>> ?????????! ?????? ?????? ?? ???????? ????????!?
>>
>> On Tue, Sep 20, 2022 at 05:46:15PM +0300, Alexander V. Buev wrote:
>>> This series of patches makes possible to do direct block IO
>>> with integrity payload using io uring kernel interface.
>>> Userspace app can utilize new READV_PI/WRITEV_PI operation with a new
>>> fields in sqe struct (pi_addr/pi_len) to provide iovec's with
>>> integrity data.
>>
>> Is this really intended to be used exclusively for PI? Once you give use space
>> access to extended metadata regions, they can use it for whatever the user
>> wants, which may not be related to protection information formats. Perhaps a
>> more generic suffix than "_PI" may be appropriate like _EXT or _META?
> 
> Currently we use this code for transfer block IO with meta information 
> from user space to special block device driver. This meta information includes PI and some other
> information that helps driver to process IO with some optimization, 
> special option and etc. In the near feature we can extend this info die to increased
> requirements for our product.
> 
> Also we can use this code for transfer IO with PI information from user space
> to supported block devices such as nvme & scsi.
> 
> And you are right. Just for me "_meta" is more appropriate and abstract suffix for this,
> but:
> 
>  1. "PI" is shortly
>  2. "PI" and "integrity" is widely used in block layer code and I decided that
>     if it's called PI - everyone understands what exactly it is about.
>  3. User can read/write general info only in case of using special block layer driver. 
> 
> Anyway I'm ready to rename this things.
> 
> May be it's enough to rename only userspace visible part?
> (sqe struct members & op codes)

Let's please make it consistent across the userspace and internal parts
in terms of naming.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v5 1/3] block: bio-integrity: add PI iovec to bio
  2022-09-20 14:46 ` [PATCH v5 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
@ 2022-09-27  7:47   ` Christoph Hellwig
  0 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2022-09-27  7:47 UTC (permalink / raw)
  To: Alexander V. Buev
  Cc: linux-block, io-uring, Jens Axboe, Christoph Hellwig,
	Martin K . Petersen, Pavel Begunkov, Chaitanya Kulkarni,
	Mikhail Malygin, linux

On Tue, Sep 20, 2022 at 05:46:16PM +0300, Alexander V. Buev wrote:
> Added functions to attach user PI iovec pages to bio and release this
> pages via bio_integrity_free.

I'd much prefer if for the first version we could just go down the
copy_from/to_user route as mentioned below and avoid the extra
complexity of the get_user_pages path.  Once we get the interface
right we can consider adding the get_user_pages version back if
we have benchmarks for relevant workloads that justify it.  In that
case we'll also need into refactoring thing so that more code is
shared with the bio data path.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2022-09-27  7:47 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-09-20 14:46 [PATCH v5 0/3] implement direct IO with integrity Alexander V. Buev
2022-09-20 14:46 ` [PATCH v5 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
2022-09-27  7:47   ` Christoph Hellwig
2022-09-20 14:46 ` [PATCH v5 2/3] block: io-uring: add READV_PI/WRITEV_PI operations Alexander V. Buev
2022-09-21 17:59   ` Jens Axboe
2022-09-22 12:48     ` Alexander V. Buev
2022-09-22 14:08       ` Jens Axboe
2022-09-20 14:46 ` [PATCH v5 3/3] block: fops: handle IOCB_USE_PI in direct IO Alexander V. Buev
2022-09-20 20:12 ` [PATCH v5 0/3] implement direct IO with integrity Keith Busch
2022-09-21  9:26   ` Alexander V. Buev
2022-09-22 14:09     ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox