public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH v4 0/3] implement direct IO with integrity
@ 2022-09-09 12:20 Alexander V. Buev
  2022-09-09 12:20 ` [PATCH v4 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Alexander V. Buev @ 2022-09-09 12:20 UTC (permalink / raw)
  To: linux-block
  Cc: io-uring, Jens Axboe, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux,
	Alexander V. Buev

This patch series makes it possible to do direct block IO
with an integrity payload using the io_uring kernel interface.
A userspace app can use the new READV_PI/WRITEV_PI operations together
with the new fields in the sqe struct (pi_addr/pi_len) to provide iovecs
with integrity data.

Changes since v3:
 - fixed warnings reported by robot 

Changes since v2:
 - separate code from fast path
 - keep rw_pi struct size <= 64 byte
 - using kiocb->private pointer to pass
   PI data iterator to block direct IO layer   
 - improved bio_integrity_add_iovec function 

Alexander V. Buev (3):
  block: bio-integrity: add PI iovec to bio
  block: io-uring: add READV_PI/WRITEV_PI operations
  block: fops: handle IOCB_USE_PI in direct IO

 block/bio-integrity.c         | 163 +++++++++
 block/fops.c                  |  80 +++++
 include/linux/bio.h           |   8 +
 include/linux/fs.h            |   1 +
 include/uapi/linux/io_uring.h |   6 +
 include/uapi/linux/uio.h      |   3 +-
 io_uring/Makefile             |   3 +-
 io_uring/io_uring.c           |   2 +
 io_uring/opdef.c              |  27 ++
 io_uring/rw.h                 |   4 +
 io_uring/rw_pi.c              | 619 ++++++++++++++++++++++++++++++++++
 io_uring/rw_pi.h              |  34 ++
 12 files changed, 948 insertions(+), 2 deletions(-)
 create mode 100644 io_uring/rw_pi.c
 create mode 100644 io_uring/rw_pi.h

-- 
2.30.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v4 1/3] block: bio-integrity: add PI iovec to bio
  2022-09-09 12:20 [PATCH v4 0/3] implement direct IO with integrity Alexander V. Buev
@ 2022-09-09 12:20 ` Alexander V. Buev
  2022-09-09 14:38   ` Christoph Hellwig
  2022-09-09 12:20 ` [PATCH v4 2/3] block: io-uring: add READV_PI/WRITEV_PI operations Alexander V. Buev
  2022-09-09 12:20 ` [PATCH v4 3/3] block: fops: handle IOCB_USE_PI in direct IO Alexander V. Buev
  2 siblings, 1 reply; 7+ messages in thread
From: Alexander V. Buev @ 2022-09-09 12:20 UTC (permalink / raw)
  To: linux-block
  Cc: io-uring, Jens Axboe, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux,
	Alexander V. Buev

Add functions to attach user PI iovec pages to a bio and to release
these pages via bio_integrity_free.

Signed-off-by: Alexander V. Buev <[email protected]>
---
 block/bio-integrity.c | 163 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h   |   8 +++
 2 files changed, 171 insertions(+)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 3f5685c00e36..bd6b74ae2c95 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -10,6 +10,7 @@
 #include <linux/mempool.h>
 #include <linux/export.h>
 #include <linux/bio.h>
+#include <linux/uio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include "blk.h"
@@ -91,6 +92,18 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+/*
+ * Drop one page reference for every populated bvec of @bio's integrity
+ * payload. Called from bio_integrity_free() when BIP_RELEASE_PAGES is set.
+ *
+ * Only used inside this file, so it is static: a global definition
+ * without a prototype triggers -Wmissing-prototypes.
+ */
+static void bio_integrity_release_pages(struct bio *bio)
+{
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+	unsigned short i;
+
+	for (i = 0; i < bip->bip_vcnt; i++)
+		put_page(bip->bip_vec[i].bv_page);
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -105,6 +118,10 @@ void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	else {
+		if (bip->bip_flags & BIP_RELEASE_PAGES)
+			bio_integrity_release_pages(bio);
+	}
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -378,6 +395,152 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
 	bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
 }
 
+/*
+ * Find scratch space for an array of @nr_needed_page page pointers inside
+ * memory the bio already owns, so no extra allocation is required.
+ * Returns NULL when neither candidate area is large enough.
+ */
+static inline
+struct page **__bio_integrity_temp_pages(struct bio *bio, unsigned int nr_needed_page)
+{
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+	unsigned int room = 0;
+
+	/* First candidate: integrity bvec slots past the first @nr_needed_page. */
+	if (bip->bip_max_vcnt > nr_needed_page)
+		room = (bip->bip_max_vcnt - nr_needed_page) *
+			sizeof(struct bio_vec)/sizeof(struct page *);
+	if (room >= nr_needed_page)
+		return (struct page **) (bip->bip_vec + nr_needed_page);
+
+	/* Second candidate: data bvec slots past the populated ones. */
+	if (!(bio->bi_max_vecs - bio->bi_vcnt))
+		return NULL;
+
+	room = (bio->bi_max_vecs - bio->bi_vcnt) *
+		sizeof(struct bio_vec)/sizeof(struct page *);
+	if (room < nr_needed_page)
+		return NULL;
+
+	return (struct page **) (bio->bi_io_vec + bio->bi_vcnt);
+}
+
+/**
+ * bio_integrity_add_iovec - Add PI io vector
+ * @bio:	bio whose integrity vector to update
+ * @pi_iter:	iov_iter describing the PI buffer to add to @bio's integrity
+ *
+ * Description: Pins the pages behind @pi_iter and appends them to @bio's
+ * integrity payload, allocating the payload when @bio has none yet. Once a
+ * page is attached, BIP_RELEASE_PAGES is set so that bio_integrity_free()
+ * drops the page references again.
+ *
+ * Return: 0 on success, negative errno on failure. Failures that happen
+ * after the payload has been looked up or allocated free the payload.
+ */
+int bio_integrity_add_iovec(struct bio *bio, struct iov_iter *pi_iter)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	struct bio_integrity_payload *bip;
+	struct page **pi_page = NULL, **bio_page;
+	unsigned int nr_vec_page;
+	int ret;
+	ssize_t size;
+	size_t offset, pg_num, page_count;
+
+	if (unlikely(!(bi && bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE))) {
+		pr_err("Device %d:%d is not integrity capable\n",
+			MAJOR(bio->bi_bdev->bd_dev), MINOR(bio->bi_bdev->bd_dev));
+		return -EINVAL;
+	}
+
+	nr_vec_page = iov_iter_npages(pi_iter,
+		queue_max_integrity_segments(bdev_get_queue(bio->bi_bdev)));
+	bip = bio_integrity(bio);
+	if (bip) {
+		if (nr_vec_page > (bip->bip_max_vcnt - bip->bip_vcnt))
+			return -ENOMEM;
+	} else {
+		bip = bio_integrity_alloc(bio, GFP_NOIO, nr_vec_page);
+		if (IS_ERR(bip))
+			return PTR_ERR(bip);
+	}
+
+	/* get space for page pointers array */
+	bio_page = __bio_integrity_temp_pages(bio, nr_vec_page);
+
+	if (likely(bio_page))
+		pi_page = bio_page;
+	else {
+		/* no spare room inside the bio: fall back to a heap array */
+		pi_page = kcalloc(nr_vec_page, sizeof(*pi_page), GFP_NOIO);
+		if (!pi_page) {
+			ret = -ENOMEM;
+			goto error;
+		}
+	}
+
+	bip->bip_iter.bi_size = pi_iter->count;
+	bip->bio_iter = bio->bi_iter;
+	bip_set_seed(bip, bio->bi_iter.bi_sector);
+
+	if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
+		bip->bip_flags |= BIP_IP_CHECKSUM;
+
+	do {
+		size = iov_iter_get_pages2(pi_iter, pi_page, LONG_MAX,
+						nr_vec_page, &offset);
+		if (unlikely(size <= 0)) {
+			pr_err("Failed to pin integrity buffer for %d:%d\n",
+				MAJOR(bio->bi_bdev->bd_dev),
+				MINOR(bio->bi_bdev->bd_dev));
+			pr_err("Buffer size=%zu pages=%u err=%zi\n",
+				pi_iter->count, nr_vec_page, size);
+			ret = (size) ? size : -EFAULT;
+			goto error;
+		}
+
+		page_count = DIV_ROUND_UP(offset + size, PAGE_SIZE);
+
+		/* fill bio integrity biovecs the given pages */
+		for (pg_num = 0; pg_num < page_count; ++pg_num) {
+			size_t page_len;
+
+			page_len = min_t(size_t, PAGE_SIZE - offset, size);
+			ret = bio_integrity_add_page(bio, pi_page[pg_num],
+							page_len, offset);
+			if (unlikely(ret != page_len)) {
+				/*
+				 * Pages [0, pg_num) of this batch already
+				 * belong to the bip and are released by
+				 * bio_integrity_free() in the error path.
+				 * Putting them here as well would drop the
+				 * reference twice. Only the pages that were
+				 * pinned but never attached are put here.
+				 */
+				for (; pg_num < page_count; ++pg_num)
+					put_page(pi_page[pg_num]);
+				ret = -ENOMEM;
+				goto error;
+			}
+			size -= page_len;
+			offset = 0;
+			bip->bip_flags |= BIP_RELEASE_PAGES;
+		}
+
+		nr_vec_page -= page_count;
+
+	} while (pi_iter->count && nr_vec_page);
+
+
+	if (pi_iter->count) {
+		pr_err("Failed to pin whole integrity buffer for %d:%d\n",
+			MAJOR(bio->bi_bdev->bd_dev),
+			MINOR(bio->bi_bdev->bd_dev));
+		pr_err("Data of size=%zu not pined\n", pi_iter->count);
+		ret = -EFAULT;
+		goto error;
+	}
+
+	if (pi_page != bio_page)
+		kfree(pi_page);
+
+	return 0;
+
+error:
+	/* releases the attached pages too when BIP_RELEASE_PAGES is set */
+	if (bio_integrity(bio))
+		bio_integrity_free(bio);
+
+	/* only a heap fallback array needs freeing; kfree(NULL) is a no-op */
+	if (pi_page != bio_page)
+		kfree(pi_page);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_add_iovec);
+
 /**
  * bio_integrity_trim - Trim integrity vector
  * @bio:	bio whose integrity vector to update
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ca22b06700a9..e7e328425c90 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -317,6 +317,7 @@ enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_RELEASE_PAGES	= 1 << 5, /* release pages after io completion */
 };
 
 /*
@@ -699,6 +700,7 @@ extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, un
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
 extern bool bio_integrity_prep(struct bio *);
 extern void bio_integrity_advance(struct bio *, unsigned int);
+extern int bio_integrity_add_iovec(struct bio *bio, struct iov_iter *iter);
 extern void bio_integrity_trim(struct bio *);
 extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
 extern int bioset_integrity_create(struct bio_set *, int);
@@ -739,6 +741,12 @@ static inline void bio_integrity_advance(struct bio *bio,
 	return;
 }
 
+/*
+ * No-op variant of bio_integrity_add_iovec(): attaches nothing to @bio and
+ * reports success.
+ * NOTE(review): presumably the stub for builds without block integrity
+ * support - confirm against the surrounding #ifdef, and whether silently
+ * returning 0 is the intended behaviour there.
+ */
+static inline int bio_integrity_add_iovec(struct bio *bio,
+					struct iov_iter *pi_iter)
+{
+	return 0;
+}
+
 static inline void bio_integrity_trim(struct bio *bio)
 {
 	return;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v4 2/3] block: io-uring: add READV_PI/WRITEV_PI operations
  2022-09-09 12:20 [PATCH v4 0/3] implement direct IO with integrity Alexander V. Buev
  2022-09-09 12:20 ` [PATCH v4 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
@ 2022-09-09 12:20 ` Alexander V. Buev
  2022-09-15 23:22   ` kernel test robot
  2022-09-09 12:20 ` [PATCH v4 3/3] block: fops: handle IOCB_USE_PI in direct IO Alexander V. Buev
  2 siblings, 1 reply; 7+ messages in thread
From: Alexander V. Buev @ 2022-09-09 12:20 UTC (permalink / raw)
  To: linux-block
  Cc: io-uring, Jens Axboe, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux,
	Alexander V. Buev

Added new READV_PI/WRITEV_PI operations to io_uring.
Added new pi_addr & pi_len fields to SQE struct.
Added new IOCB_USE_PI flag to kiocb struct.
Use the kiocb->private pointer to pass the PI data
iterator to the lower layer.

Signed-off-by: Alexander V. Buev <[email protected]>
---
 include/linux/fs.h            |   1 +
 include/uapi/linux/io_uring.h |   6 +
 include/uapi/linux/uio.h      |   3 +-
 io_uring/Makefile             |   3 +-
 io_uring/io_uring.c           |   2 +
 io_uring/opdef.c              |  27 ++
 io_uring/rw.h                 |   4 +
 io_uring/rw_pi.c              | 619 ++++++++++++++++++++++++++++++++++
 io_uring/rw_pi.h              |  34 ++
 9 files changed, 697 insertions(+), 2 deletions(-)
 create mode 100644 io_uring/rw_pi.c
 create mode 100644 io_uring/rw_pi.h

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eced4cc286e..a28b12a22750 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -337,6 +337,7 @@ enum rw_hint {
 #define IOCB_NOIO		(1 << 20)
 /* can use bio alloc cache */
 #define IOCB_ALLOC_CACHE	(1 << 21)
+#define IOCB_USE_PI		(1 << 22)
 
 struct kiocb {
 	struct file		*ki_filp;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6b83177fd41d..a4158e48cecb 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -80,6 +80,10 @@ struct io_uring_sqe {
 			__u64	addr3;
 			__u64	__pad2[1];
 		};
+		struct {
+			__u64	pi_addr;
+			__u32	pi_len;
+		};
 		/*
 		 * If the ring is initialized with IORING_SETUP_SQE128, then
 		 * this field is used for 80 bytes of arbitrary command data
@@ -206,6 +210,8 @@ enum io_uring_op {
 	IORING_OP_SOCKET,
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
+	IORING_OP_READV_PI,
+	IORING_OP_WRITEV_PI,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 059b1a9147f4..c9eaaa6cdb0f 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -23,9 +23,10 @@ struct iovec
 /*
  *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
  */
- 
+
 #define UIO_FASTIOV	8
 #define UIO_MAXIOV	1024
+#define UIO_FASTIOV_PI	1
 
 
 #endif /* _UAPI__LINUX_UIO_H */
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 8cc8e5387a75..8c01546c2bcf 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					openclose.o uring_cmd.o epoll.o \
 					statx.o net.o msg_ring.o timeout.o \
 					sqpoll.o fdinfo.o tctx.o poll.o \
-					cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+					cancel.o kbuf.o rsrc.o rw.o opdef.o \
+					notif.o rw_pi.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f9be9b7eb654..1736a85f8c95 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3926,7 +3926,9 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
+	BUILD_BUG_SQE_ELEM(48, __u64,  pi_addr);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+	BUILD_BUG_SQE_ELEM(56, __u32,  pi_len);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index c61494e0a602..da2b12a44995 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,7 @@
 #include "poll.h"
 #include "cancel.h"
 #include "rw.h"
+#include "rw_pi.h"
 
 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
 {
@@ -488,6 +489,32 @@ const struct io_op_def io_op_defs[] = {
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
+	[IORING_OP_READV_PI] = {
+		.needs_file		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.async_size		= sizeof(struct io_async_rw_pi),
+		.name			= "READV_PI",
+		.prep			= io_prep_rw_pi,
+		.issue			= io_readv_pi,
+		.prep_async		= io_readv_pi_prep_async,
+		/* PI-aware cleanup: frees both the data and the PI iovec */
+		.cleanup		= io_readv_writev_pi_cleanup,
+	},
+	[IORING_OP_WRITEV_PI] = {
+		.needs_file		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.async_size		= sizeof(struct io_async_rw_pi),
+		.name			= "WRITEV_PI",
+		.prep			= io_prep_rw_pi,
+		.issue			= io_writev_pi,
+		.prep_async		= io_writev_pi_prep_async,
+		/* PI-aware cleanup: frees both the data and the PI iovec */
+		.cleanup		= io_readv_writev_pi_cleanup,
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 0204c3fcafa5..c00ece398540 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_RW_H
+#define IOU_RW_H
 
 #include <linux/pagemap.h>
 
@@ -21,3 +23,5 @@ int io_readv_prep_async(struct io_kiocb *req);
 int io_write(struct io_kiocb *req, unsigned int issue_flags);
 int io_writev_prep_async(struct io_kiocb *req);
 void io_readv_writev_cleanup(struct io_kiocb *req);
+
+#endif
diff --git a/io_uring/rw_pi.c b/io_uring/rw_pi.c
new file mode 100644
index 000000000000..58ea9fcd062c
--- /dev/null
+++ b/io_uring/rw_pi.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/fsnotify.h>
+#include <linux/poll.h>
+#include <linux/nospec.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "opdef.h"
+#include "kbuf.h"
+#include "rsrc.h"
+#include "rw_pi.h"
+
+#define io_kiocb_to_kiocb(req, type) \
+				(&((type *)io_kiocb_to_cmd(req, type))->kiocb)
+#define DATA	(0)
+#define PI	(1)
+
+/*
+ * Per-request command data for READV_PI/WRITEV_PI.
+ *
+ * NOTE(review): kiocb.private holds the user PI iovec pointer after prep,
+ * but the issue handlers overwrite it with a pointer to the PI iov_iter.
+ * A later re-import through io_import_iovecs_pi() would then read a kernel
+ * pointer as a user pointer - confirm this cannot happen.
+ */
+struct io_rw_pi {
+	struct kiocb			kiocb;
+	/* user address of the data iovec array (sqe->addr) */
+	u64				addr;
+	/* number of data iovec segments (sqe->len) */
+	u32				nr_segs;
+	/* number of PI iovec segments (sqe->pi_len) */
+	u32				nr_pi_segs;
+};
+
+/* Rewind both the data and the PI iterator to their last saved state. */
+static inline
+void io_rw_pi_state_iter_restore(struct io_rw_state *data, struct __io_rw_pi_state *pi)
+{
+	struct iov_iter *data_iter = &data->iter;
+	struct iov_iter *pi_iter = &pi->iter;
+
+	iov_iter_restore(data_iter, &data->iter_state);
+	iov_iter_restore(pi_iter, &pi->iter_state);
+}
+
+/* Snapshot the current position of both the data and the PI iterator. */
+static inline
+void io_rw_pi_state_iter_save(struct io_rw_state *data, struct __io_rw_pi_state *pi)
+{
+	struct iov_iter *data_iter = &data->iter;
+	struct iov_iter *pi_iter = &pi->iter;
+
+	iov_iter_save_state(data_iter, &data->iter_state);
+	iov_iter_save_state(pi_iter, &pi->iter_state);
+}
+
+/* True when the request carries the REQ_F_SUPPORT_NOWAIT flag. */
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
+{
+	return (req->flags & REQ_F_SUPPORT_NOWAIT) != 0;
+}
+
+/*
+ * Complete @kiocb with @ret unless the IO was queued (-EIOCBQUEUED), in
+ * which case nothing is done here.
+ */
+static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+{
+	if (ret == -EIOCBQUEUED)
+		return;
+
+	/*
+	 * We can't just restart the syscall, since previously
+	 * submitted sqes may already be in progress. Just fail this
+	 * IO with EINTR.
+	 */
+	if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+	    ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
+		ret = -EINTR;
+
+	kiocb->ki_complete(kiocb, ret);
+}
+
+/*
+ * Resolve the file position for the request. An explicit offset is used as
+ * is. An offset of -1 means "current file position": for non-stream files
+ * it is taken from f_pos and REQ_F_CUR_POS is set; for stream files the
+ * position is zeroed and NULL is returned.
+ */
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+	struct kiocb *kiocb = &rw->kiocb;
+
+	if (kiocb->ki_pos == -1) {
+		if (req->file->f_mode & FMODE_STREAM) {
+			kiocb->ki_pos = 0;
+			return NULL;
+		}
+		req->flags |= REQ_F_CUR_POS;
+		kiocb->ki_pos = req->file->f_pos;
+	}
+
+	return &kiocb->ki_pos;
+}
+
+/* Schedule task work that hands @req to io_queue_iowq() for a reissue. */
+static void io_req_task_queue_reissue(struct io_kiocb *req)
+{
+	req->io_task_work.func = io_queue_iowq;
+	io_req_task_work_add(req);
+}
+
+/*
+ * Get @req ready to be issued again. With async data present both
+ * iterators are rewound to their saved state; otherwise the async prep
+ * handler is run and its success decides the result.
+ */
+static bool io_resubmit_prep(struct io_kiocb *req)
+{
+	if (req_has_async_data(req)) {
+		struct io_async_rw_pi *arw = req->async_data;
+
+		io_rw_pi_state_iter_restore(&arw->data.s, &arw->pi.s);
+		return true;
+	}
+
+	return !io_req_prep_async(req);
+}
+
+/* Decide whether a failed request may be transparently reissued. */
+static bool io_rw_should_reissue(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	/* the submitter asked for no waiting: never retry behind its back */
+	if (req->flags & REQ_F_NOWAIT)
+		return false;
+	/* io-wq workers only reissue when the ring is in IOPOLL mode */
+	if (io_wq_current_is_worker() && !(ctx->flags & IORING_SETUP_IOPOLL))
+		return false;
+	/*
+	 * If ref is dying, we might be running poll reap from the exit work.
+	 * Don't attempt to reissue from that path, just let it fail with
+	 * -EAGAIN.
+	 */
+	if (percpu_ref_is_dying(&ctx->refs))
+		return false;
+	/*
+	 * Play it safe and assume not safe to re-import and reissue if we're
+	 * not in the original thread group (or in task context).
+	 */
+	return same_thread_group(req->task, current) && in_task();
+}
+
+/*
+ * Shared completion handling: emit the fsnotify event and compare @res
+ * with the expected byte count stored in req->cqe.res.
+ *
+ * Returns true when the request has been flagged for reissue, false when
+ * the caller should go on completing it.
+ */
+static bool __io_complete_rw_common(struct io_kiocb *req, long res)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+	bool retryable;
+
+	if (rw->kiocb.ki_flags & IOCB_WRITE)
+		fsnotify_modify(req->file);
+	else
+		fsnotify_access(req->file);
+
+	/* transferred exactly what was asked for: nothing else to do */
+	if (likely(res == req->cqe.res))
+		return false;
+
+	retryable = (res == -EAGAIN || res == -EOPNOTSUPP);
+	if (retryable && io_rw_should_reissue(req)) {
+		req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
+		return true;
+	}
+
+	req_set_fail(req);
+	req->cqe.res = res;
+	return false;
+}
+
+/*
+ * ki_complete callback for the non-IOPOLL case (installed by
+ * io_rw_pi_init_file()). Posts the result through task work unless the
+ * common handler flagged the request for reissue.
+ */
+static void io_complete_rw(struct kiocb *kiocb, long res)
+{
+	struct io_rw_pi *rw = container_of(kiocb, struct io_rw_pi, kiocb);
+	struct io_kiocb *req = cmd_to_io_kiocb(rw);
+
+	/* flagged for reissue: the issuing path picks it up from here */
+	if (__io_complete_rw_common(req, res))
+		return;
+	io_req_set_res(req, res, 0);
+	req->io_task_work.func = io_req_task_complete;
+	io_req_task_work_add(req);
+}
+
+/*
+ * ki_complete callback for IOPOLL rings (installed by
+ * io_rw_pi_init_file()). Records an unexpected result in req->cqe.res and
+ * marks the request as completed for the poller, unless it is flagged for
+ * reissue.
+ */
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
+{
+	struct io_rw_pi *rw = container_of(kiocb, struct io_rw_pi, kiocb);
+	struct io_kiocb *req = cmd_to_io_kiocb(rw);
+
+	if (unlikely(res != req->cqe.res)) {
+		if (res == -EAGAIN && io_rw_should_reissue(req)) {
+			/* leave ->iopoll_completed untouched */
+			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
+			return;
+		}
+		req->cqe.res = res;
+	}
+
+	/* order with io_iopoll_complete() checking ->iopoll_completed */
+	smp_store_release(&req->iopoll_completed, 1);
+}
+
+/*
+ * Finish a request once the read/write call has returned @ret.
+ *
+ * Returns IOU_OK when the result was stored in the request right here,
+ * IOU_ISSUE_SKIP_COMPLETE when completion is left to the kiocb callback or
+ * to a queued reissue/failure.
+ */
+static int kiocb_done(struct io_kiocb *req, ssize_t ret,
+		       unsigned int issue_flags)
+{
+	struct io_async_rw_pi *arw = req->async_data;
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+
+	/* add previously done IO, if any */
+	if (req_has_async_data(req) && arw->data.bytes_done > 0) {
+		if (ret < 0)
+			ret = arw->data.bytes_done;
+		else
+			ret += arw->data.bytes_done;
+	}
+
+	/* write the advanced position back when f_pos was used */
+	if (req->flags & REQ_F_CUR_POS)
+		req->file->f_pos = rw->kiocb.ki_pos;
+	/* non-IOPOLL success: complete inline instead of via the callback */
+	if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
+		if (!__io_complete_rw_common(req, ret)) {
+			io_req_set_res(req, req->cqe.res, 0);
+			return IOU_OK;
+		}
+	} else {
+		io_rw_done(&rw->kiocb, ret);
+	}
+
+	/* a completion handler above may have asked for a reissue */
+	if (req->flags & REQ_F_REISSUE) {
+		req->flags &= ~REQ_F_REISSUE;
+		if (io_resubmit_prep(req))
+			io_req_task_queue_reissue(req);
+		else
+			io_req_task_queue_fail(req, ret);
+	}
+	return IOU_ISSUE_SKIP_COMPLETE;
+}
+
+/*
+ * Prep handler for READV_PI/WRITEV_PI: copies rw flags, offset, priority
+ * and the data/PI iovec descriptors out of @sqe into the request's command
+ * area. Returns 0 or a negative errno from flag/priority validation.
+ */
+int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+	struct kiocb *kiocb = &rw->kiocb;
+	unsigned int prio;
+	int err;
+
+	kiocb->ki_flags = 0;
+	err = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+	if (unlikely(err))
+		return err;
+
+	kiocb->ki_pos = READ_ONCE(sqe->off);
+
+	prio = READ_ONCE(sqe->ioprio);
+	if (!prio) {
+		/* nothing requested: inherit the submitter's priority */
+		kiocb->ki_ioprio = get_current_ioprio();
+	} else {
+		err = ioprio_check_cap(prio);
+		if (err)
+			return err;
+
+		kiocb->ki_ioprio = prio;
+	}
+
+	req->imu = NULL;
+
+	/* data iovec array: user pointer and segment count */
+	rw->addr = (uintptr_t)READ_ONCE(sqe->addr);
+	rw->nr_segs = READ_ONCE(sqe->len);
+
+	/* PI iovec array: the user pointer is parked in kiocb->private */
+	kiocb->private = u64_to_user_ptr(READ_ONCE(sqe->pi_addr));
+	rw->nr_pi_segs = READ_ONCE(sqe->pi_len);
+
+	kiocb->ki_flags |= IOCB_USE_PI;
+
+	return 0;
+}
+
+
+/*
+ * Import the user data and PI iovec arrays of @req.
+ *
+ * @io_dir:	READ or WRITE
+ * @iovec:	two-element array; on success iovec[DATA] / iovec[PI] hold
+ *		the array the caller has to kfree(), or NULL when the inline
+ *		fast_iov of the corresponding state was used
+ * @s_data:	receives the data iterator and its saved state
+ * @s_pi:	receives the PI iterator and its saved state
+ *
+ * Returns 0 on success, a negative errno on failure. An empty data or PI
+ * vector is rejected with -EINVAL. Nothing is left allocated on failure.
+ */
+static inline int
+io_import_iovecs_pi(int io_dir, struct io_kiocb *req, struct iovec **iovec,
+			struct io_rw_state *s_data, struct __io_rw_pi_state *s_pi)
+{
+	struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
+	struct iovec __user *uvec;
+	ssize_t ret;
+
+	/* data */
+	uvec = u64_to_user_ptr(rw->addr);
+	iovec[DATA] = s_data->fast_iov;
+	ret = __import_iovec(io_dir, uvec, rw->nr_segs,
+				UIO_FASTIOV, iovec + DATA,
+				&s_data->iter, req->ctx->compat);
+
+	if (unlikely(ret <= 0)) {
+		/*
+		 * A zero-length import is a success for __import_iovec()
+		 * and may have allocated the array; on error the slot is
+		 * NULL, so this kfree() is safe either way.
+		 */
+		kfree(iovec[DATA]);
+		return (ret) ? ret : -EINVAL;
+	}
+	/* pi: prep stored the user pointer in kiocb->private */
+	uvec = (__force struct iovec __user *)rw->kiocb.private;
+	iovec[PI] = s_pi->fast_iov;
+	ret = __import_iovec(io_dir, uvec, rw->nr_pi_segs,
+				UIO_FASTIOV_PI, iovec + PI,
+				&s_pi->iter, req->ctx->compat);
+	if (unlikely(ret <= 0)) {
+		/* same reasoning as above, for both arrays */
+		kfree(iovec[PI]);
+		kfree(iovec[DATA]);
+		return (ret) ? ret : -EINVAL;
+	}
+
+	/* save states */
+	io_rw_pi_state_iter_save(s_data, s_pi);
+
+	return 0;
+}
+
+/*
+ * Re-home an iterator that uses the inline iovec array of @s so that it
+ * uses the inline array of @async_s instead. Expects the iterator itself
+ * to have been copied into @async_s already (the caller does so).
+ */
+static inline void
+io_setup_async_state(struct io_rw_state *async_s, const struct io_rw_state *s)
+{
+	unsigned int iov_off = 0;
+
+	async_s->iter.iov = async_s->fast_iov;
+	/* keep the same element offset if the source iterator has advanced */
+	if (s->iter.iov != s->fast_iov) {
+		iov_off = s->iter.iov - s->fast_iov;
+		async_s->iter.iov += iov_off;
+	}
+	/* copy the remaining segments unless both states share one array */
+	if (async_s->fast_iov != s->fast_iov) {
+		memcpy(async_s->fast_iov + iov_off, s->fast_iov + iov_off,
+			       sizeof(struct iovec) * s->iter.nr_segs);
+	}
+}
+
+/*
+ * Move the iterator state of a request into freshly allocated async data
+ * so that it can be issued again later.
+ *
+ * @iovec:	heap iovec arrays for data/PI (NULL entries mean the inline
+ *		fast_iov was used). Ownership passes to this function: on
+ *		success the arrays are recorded in the async data, on
+ *		allocation failure they are freed here.
+ *
+ * Returns 0 on success (also when async data already exists), -ENOMEM
+ * when the async data cannot be allocated.
+ */
+static int
+io_setup_async_rw_pi(struct io_kiocb *req, struct iovec * const *iovec,
+			struct io_rw_state *s_data,
+			struct __io_rw_pi_state *s_pi)
+{
+	struct io_async_rw_pi *arw;
+
+	if (req_has_async_data(req))
+		return 0;
+
+	if (io_alloc_async_data(req)) {
+		/*
+		 * Callers return right away with our error code, so the
+		 * arrays would leak if they were not released here.
+		 */
+		kfree(iovec[DATA]);
+		kfree(iovec[PI]);
+		return -ENOMEM;
+	}
+
+	arw = req->async_data;
+
+	/* data */
+	arw->data.s.iter = s_data->iter;
+	arw->data.free_iovec = iovec[DATA];
+	arw->data.bytes_done = 0;
+
+	if (iovec[DATA])
+		req->flags |= REQ_F_NEED_CLEANUP;
+	else
+		io_setup_async_state(&arw->data.s, s_data);
+
+	/* pi */
+	arw->pi.s.iter = s_pi->iter;
+	arw->pi.free_iovec = iovec[PI];
+
+	if (iovec[PI])
+		req->flags |= REQ_F_NEED_CLEANUP;
+	else {
+		/* io_setup_async_state() is typed for the data state */
+		io_setup_async_state((struct io_rw_state *)&arw->pi.s,
+					(const struct io_rw_state *)s_pi);
+	}
+
+	/* save states */
+	io_rw_pi_state_iter_save(&arw->data.s, &arw->pi.s);
+
+	return 0;
+}
+
+/*
+ * Common async-prep body: import both iovec arrays straight into the
+ * request's async data and record which arrays need freeing later.
+ */
+static inline int io_rw_pi_prep_async(struct io_kiocb *req, int io_dir)
+{
+	struct io_async_rw_pi *arw = req->async_data;
+	struct iovec *vecs[2];
+	int err;
+
+	err = io_import_iovecs_pi(io_dir, req, vecs,
+					&arw->data.s, &arw->pi.s);
+	if (unlikely(err < 0))
+		return err;
+
+	arw->pi.free_iovec = vecs[PI];
+	arw->data.free_iovec = vecs[DATA];
+	arw->data.bytes_done = 0;
+
+	/* any heap array must be released by the cleanup handler */
+	if (vecs[DATA] || vecs[PI])
+		req->flags |= REQ_F_NEED_CLEANUP;
+
+	return 0;
+}
+
+
+/* Async-prep handler for READV_PI: imports the iovecs for a READ. */
+int io_readv_pi_prep_async(struct io_kiocb *req)
+{
+	return io_rw_pi_prep_async(req, READ);
+}
+
+/* Async-prep handler for WRITEV_PI: imports the iovecs for a WRITE. */
+int io_writev_pi_prep_async(struct io_kiocb *req)
+{
+	return io_rw_pi_prep_async(req, WRITE);
+}
+
+/*
+ * Validate the target file and finish setting up the kiocb.
+ *
+ * Only block device files opened with O_DIRECT and with the access @mode
+ * are accepted. Also picks the completion callback depending on whether
+ * the ring uses IOPOLL.
+ *
+ * Returns 0, -EBADF (no file / wrong mode), -EINVAL (not a block device,
+ * not O_DIRECT, or HIPRI without IOPOLL) or -EOPNOTSUPP (IOPOLL ring but
+ * the file cannot be polled).
+ */
+static int io_rw_pi_init_file(struct io_kiocb *req, fmode_t mode)
+{
+	struct kiocb *kiocb = io_kiocb_to_kiocb(req, struct io_rw_pi);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct file *file = req->file;
+	int flags;
+
+	if (unlikely(!file || !(file->f_mode & mode)))
+		return -EBADF;
+
+	if (unlikely(!S_ISBLK(file_inode(req->file)->i_mode)))
+		return -EINVAL;
+
+	if (unlikely(!(file->f_flags & O_DIRECT)))
+		return -EINVAL;
+
+	if (!io_req_ffs_set(req))
+		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+	/* merge the file's flags with the ones set at prep time */
+	flags = kiocb->ki_flags;
+	kiocb->ki_flags = iocb_flags(file);
+	kiocb->ki_flags |= flags;
+
+	/*
+	 * If the file is marked O_NONBLOCK, still allow retry for it if it
+	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
+	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
+	 */
+	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
+		req->flags |= REQ_F_NOWAIT;
+
+	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
+			return -EOPNOTSUPP;
+
+		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
+		kiocb->ki_complete = io_complete_rw_iopoll;
+		req->iopoll_completed = 0;
+	} else {
+		/* HIPRI is only accepted on IOPOLL rings */
+		if (kiocb->ki_flags & IOCB_HIPRI)
+			return -EINVAL;
+		kiocb->ki_complete = io_complete_rw;
+	}
+
+	return 0;
+}
+
+/*
+ * Cleanup handler for READV_PI/WRITEV_PI: frees the heap iovec arrays
+ * recorded in the async data, for PI as well as for data.
+ */
+void io_readv_writev_pi_cleanup(struct io_kiocb *req)
+{
+	struct io_async_rw_pi *async = req->async_data;
+
+	kfree(async->pi.free_iovec);
+	kfree(async->data.free_iovec);
+}
+
+/*
+ * Issue handler for IORING_OP_READV_PI.
+ *
+ * Imports the data and PI iovecs (or reuses the state kept in the async
+ * data), passes the PI iterator to the lower layer through kiocb->private
+ * and calls ->read_iter().
+ *
+ * Returns the result of kiocb_done(), IOU_ISSUE_SKIP_COMPLETE when the IO
+ * was queued, -EAGAIN when the request has to be retried from a context
+ * that may block, or a negative errno.
+ */
+int io_readv_pi(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_rw_pi_state s;
+	struct io_rw_state *s_data;
+	struct __io_rw_pi_state *s_pi;
+	struct iovec *iovec[2];
+	struct kiocb *kiocb = io_kiocb_to_kiocb(req, struct io_rw_pi);
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	ssize_t ret;
+	loff_t *ppos;
+
+	if (!req_has_async_data(req)) {
+		s_data = &s.data;
+		s_pi = &s.pi;
+		ret = io_import_iovecs_pi(READ, req, iovec, s_data, s_pi);
+		if (unlikely(ret < 0))
+			return ret;
+	} else {
+		struct io_async_rw_pi *arw = req->async_data;
+
+		/* arrays are owned by the async data, nothing to free here */
+		iovec[DATA] = iovec[PI] = NULL;
+		s_data = &arw->data.s;
+		s_pi = &arw->pi.s;
+		io_rw_pi_state_iter_restore(s_data, s_pi);
+	}
+	/* hand the PI iterator to the lower layer */
+	kiocb->private = &s_pi->iter;
+
+	ret = io_rw_pi_init_file(req, FMODE_READ);
+	if (unlikely(ret))
+		goto out_free;
+
+	req->cqe.res = iov_iter_count(&s_data->iter);
+	if (force_nonblock) {
+		/* If the file doesn't support async, just async punt */
+		if (unlikely(!io_file_supports_nowait(req))) {
+			ret = io_setup_async_rw_pi(req, iovec, s_data, s_pi);
+			return ret ?: -EAGAIN;
+		}
+		kiocb->ki_flags |= IOCB_NOWAIT;
+	} else {
+		/* Ensure we clear previously set non-block flag */
+		kiocb->ki_flags &= ~IOCB_NOWAIT;
+	}
+
+	ppos = io_kiocb_update_pos(req);
+
+	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
+	if (unlikely(ret))
+		goto out_free;
+
+	if (likely(req->file->f_op->read_iter))
+		ret = call_read_iter(req->file, kiocb, &s_data->iter);
+	else
+		ret = -EINVAL;
+
+	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
+		req->flags &= ~REQ_F_REISSUE;
+
+		/* IOPOLL retry should happen for io-wq threads */
+		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
+			goto done;
+		/* no retry on NONBLOCK nor RWF_NOWAIT */
+		if (req->flags & REQ_F_NOWAIT)
+			goto done;
+		/*
+		 * Retry later. Completing with a zero result here would
+		 * report a failed zero-byte read to userspace, so rewind
+		 * the iterators, keep the state in async data and return
+		 * -EAGAIN, as the write path does.
+		 */
+		io_rw_pi_state_iter_restore(s_data, s_pi);
+		ret = io_setup_async_rw_pi(req, iovec, s_data, s_pi);
+		return ret ?: -EAGAIN;
+	} else if (ret == -EIOCBQUEUED) {
+		ret = IOU_ISSUE_SKIP_COMPLETE;
+		goto out_free;
+	}
+
+done:
+	kfree(iovec[DATA]);
+	kfree(iovec[PI]);
+	return kiocb_done(req, ret, issue_flags);
+out_free:
+	kfree(iovec[DATA]);
+	kfree(iovec[PI]);
+	return ret;
+}
+
+/*
+ * Issue handler for IORING_OP_WRITEV_PI.
+ *
+ * Imports the data and PI iovecs (or reuses the state kept in the async
+ * data), passes the PI iterator to the lower layer through kiocb->private
+ * and calls ->write_iter().
+ *
+ * Returns the result of kiocb_done(), IOU_ISSUE_SKIP_COMPLETE when the IO
+ * was queued, -EAGAIN when the request has to be retried later, or a
+ * negative errno.
+ */
+int io_writev_pi(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_rw_pi_state s;
+	struct io_rw_state *s_data;
+	struct __io_rw_pi_state *s_pi;
+	struct iovec *iovec[2];
+	struct kiocb *kiocb = io_kiocb_to_kiocb(req, struct io_rw_pi);
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	ssize_t ret, ret2;
+	loff_t *ppos;
+
+	if (!req_has_async_data(req)) {
+		s_data = &s.data;
+		s_pi = &s.pi;
+		ret = io_import_iovecs_pi(WRITE, req, iovec, s_data, s_pi);
+		if (unlikely(ret < 0))
+			return ret;
+	} else {
+		struct io_async_rw_pi *arw = req->async_data;
+
+		/* arrays are owned by the async data, nothing to free here */
+		iovec[DATA] = iovec[PI] = 0;
+		s_data = &arw->data.s;
+		s_pi = &arw->pi.s;
+		io_rw_pi_state_iter_restore(s_data, s_pi);
+	}
+	/* hand the PI iterator to the lower layer */
+	kiocb->private = &s_pi->iter;
+
+	ret = io_rw_pi_init_file(req, FMODE_WRITE);
+	if (unlikely(ret))
+		goto out_free;
+
+	req->cqe.res = iov_iter_count(&s_data->iter);
+
+	if (force_nonblock) {
+		/* If the file doesn't support async, just async punt */
+		if (unlikely(!io_file_supports_nowait(req)))
+			goto copy_iov;
+
+		kiocb->ki_flags |= IOCB_NOWAIT;
+	} else {
+		/* Ensure we clear previously set non-block flag */
+		kiocb->ki_flags &= ~IOCB_NOWAIT;
+	}
+
+	ppos = io_kiocb_update_pos(req);
+
+	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
+	if (unlikely(ret))
+		goto out_free;
+
+	kiocb->ki_flags |= IOCB_WRITE;
+
+	if (likely(req->file->f_op->write_iter))
+		ret2 = call_write_iter(req->file, kiocb, &s_data->iter);
+	else
+		ret2 = -EINVAL;
+
+	/* a completion handler asked for a reissue: treat it like -EAGAIN */
+	if (req->flags & REQ_F_REISSUE) {
+		req->flags &= ~REQ_F_REISSUE;
+		ret2 = -EAGAIN;
+	}
+
+	/*
+	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
+	 * retry them without IOCB_NOWAIT.
+	 */
+	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
+		ret2 = -EAGAIN;
+	/* no retry on NONBLOCK nor RWF_NOWAIT */
+	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
+		goto done;
+	if (!force_nonblock || ret2 != -EAGAIN) {
+		if (ret2 == -EIOCBQUEUED) {
+			ret = IOU_ISSUE_SKIP_COMPLETE;
+			goto out_free;
+		}
+		/* IOPOLL retry should happen for io-wq threads */
+		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
+			goto copy_iov;
+
+done:
+		ret = kiocb_done(req, ret2, issue_flags);
+	} else {
+copy_iov:
+		/* rewind, stash the state in async data and ask for a retry */
+		io_rw_pi_state_iter_restore(s_data, s_pi);
+		ret = io_setup_async_rw_pi(req, iovec, s_data, s_pi);
+		return ret ?: -EAGAIN;
+	}
+out_free:
+	/* it's reportedly faster than delegating the null check to kfree() */
+	if (iovec[DATA])
+		kfree(iovec[DATA]);
+	if (iovec[PI])
+		kfree(iovec[PI]);
+	return ret;
+}
+
diff --git a/io_uring/rw_pi.h b/io_uring/rw_pi.h
new file mode 100644
index 000000000000..f635da982484
--- /dev/null
+++ b/io_uring/rw_pi.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_RW_PI_H
+#define IOU_RW_PI_H
+
+#include "rw.h"
+
+/*
+ * Iterator state for the PI buffer. Begins with the same iter/iter_state
+ * members that rw_pi.c accesses on struct io_rw_state, but has a smaller
+ * inline iovec array (UIO_FASTIOV_PI entries).
+ */
+struct __io_rw_pi_state {
+	struct iov_iter			iter;
+	struct iov_iter_state		iter_state;
+	struct iovec			fast_iov[UIO_FASTIOV_PI];
+};
+
+/* Iterator state of one PI request: the data part plus the PI part. */
+struct io_rw_pi_state {
+	struct io_rw_state		data;
+	struct __io_rw_pi_state		pi;
+};
+
+/* PI half of the async data kept across retries. */
+struct __io_async_rw_pi {
+	/* heap iovec array to kfree() on cleanup, NULL if none */
+	const struct iovec		*free_iovec;
+	struct __io_rw_pi_state		s;
+};
+
+/* Async data of a READV_PI/WRITEV_PI request (see .async_size in opdef.c). */
+struct io_async_rw_pi {
+	struct io_async_rw		data;
+	struct __io_async_rw_pi		pi;
+};
+
+int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_readv_pi(struct io_kiocb *req, unsigned int issue_flags);
+int io_readv_pi_prep_async(struct io_kiocb *req);
+int io_writev_pi(struct io_kiocb *req, unsigned int issue_flags);
+int io_writev_pi_prep_async(struct io_kiocb *req);
+void io_readv_writev_pi_cleanup(struct io_kiocb *req);
+#endif
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v4 3/3] block: fops: handle IOCB_USE_PI in direct IO
  2022-09-09 12:20 [PATCH v4 0/3] implement direct IO with integrity Alexander V. Buev
  2022-09-09 12:20 ` [PATCH v4 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
  2022-09-09 12:20 ` [PATCH v4 2/3] block: io-uring: add READV_PI/WRITEV_PI operations Alexander V. Buev
@ 2022-09-09 12:20 ` Alexander V. Buev
  2 siblings, 0 replies; 7+ messages in thread
From: Alexander V. Buev @ 2022-09-09 12:20 UTC (permalink / raw)
  To: linux-block
  Cc: io-uring, Jens Axboe, Christoph Hellwig, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux,
	Alexander V. Buev

Check that the size of the PI data corresponds to the device integrity
profile and the data size.
Add the PI data to the device BIO.

Signed-off-by: Alexander V. Buev <[email protected]>
---
 block/fops.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/block/fops.c b/block/fops.c
index b90742595317..d89fa7d99635 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -16,6 +16,7 @@
 #include <linux/suspend.h>
 #include <linux/fs.h>
 #include <linux/module.h>
+#include <linux/blk-integrity.h>
 #include "blk.h"
 
 static inline struct inode *bdev_file_inode(struct file *file)
@@ -51,6 +52,19 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
 
 #define DIO_INLINE_BIO_VECS 4
 
+static int __bio_integrity_add_iovec(struct bio *bio, struct iov_iter *pi_iter)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	unsigned int pi_len = bio_integrity_bytes(bi, bio->bi_iter.bi_size >> SECTOR_SHIFT);
+	size_t iter_count = pi_iter->count-pi_len;
+	int ret;
+
+	iov_iter_truncate(pi_iter, pi_len);
+	ret = bio_integrity_add_iovec(bio, pi_iter);
+	iov_iter_reexpand(pi_iter, iter_count);
+	return ret;
+}
+
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 		struct iov_iter *iter, unsigned int nr_pages)
 {
@@ -94,6 +108,15 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		bio.bi_opf |= REQ_NOWAIT;
 
+	if (iocb->ki_flags & IOCB_USE_PI) {
+		ret = __bio_integrity_add_iovec(&bio, (struct iov_iter *)iocb->private);
+		WRITE_ONCE(iocb->private, NULL);
+		if (ret) {
+			bio_release_pages(&bio, should_dirty);
+			goto out;
+		}
+	}
+
 	submit_bio_wait(&bio);
 
 	bio_release_pages(&bio, should_dirty);
@@ -178,6 +201,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
+	struct iov_iter *pi_iter = 0;
 
 	if (blkdev_dio_unaligned(bdev, pos, iter))
 		return -EINVAL;
@@ -235,6 +259,19 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		pos += bio->bi_iter.bi_size;
 
 		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
+
+		if (iocb->ki_flags & IOCB_USE_PI) {
+			if (!pi_iter)
+				pi_iter = (struct iov_iter *)iocb->private;
+			ret = __bio_integrity_add_iovec(bio, pi_iter);
+			WRITE_ONCE(iocb->private, NULL);
+			if (unlikely(ret)) {
+				bio->bi_status = BLK_STS_IOERR;
+				bio_endio(bio);
+				break;
+			}
+		}
+
 		if (!nr_pages) {
 			submit_bio(bio);
 			break;
@@ -343,6 +380,16 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		task_io_account_write(bio->bi_iter.bi_size);
 	}
 
+	if (iocb->ki_flags & IOCB_USE_PI) {
+		ret = __bio_integrity_add_iovec(bio, (struct iov_iter *)iocb->private);
+		WRITE_ONCE(iocb->private, NULL);
+		if (ret) {
+			bio->bi_status = BLK_STS_IOERR;
+			bio_endio(bio);
+			return -EIOCBQUEUED;
+		}
+	}
+
 	if (iocb->ki_flags & IOCB_HIPRI) {
 		bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
 		submit_bio(bio);
@@ -355,6 +402,31 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	return -EIOCBQUEUED;
 }
 
+static inline int
+blkdev_check_pi(struct block_device *bdev, size_t data_size, size_t pi_size)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bdev);
+	unsigned int intervals;
+
+	if (unlikely(!(bi && bi->tuple_size &&
+			bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE))) {
+		pr_err("Device %d:%d is not integrity capable",
+			MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+		return -EINVAL;
+	}
+
+	intervals = bio_integrity_intervals(bi, data_size >> SECTOR_SHIFT);
+	if (unlikely(intervals * bi->tuple_size > pi_size)) {
+		pr_err("Device %d:%d integrity & data size mismatch",
+			MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+		pr_err("data=%zu integrity=%zu intervals=%u tuple=%u",
+			data_size, pi_size,
+			intervals, bi->tuple_size);
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	unsigned int nr_pages;
@@ -362,6 +434,14 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (!iov_iter_count(iter))
 		return 0;
 
+	if (iocb->ki_flags & IOCB_USE_PI) {
+		struct block_device *bdev = iocb->ki_filp->private_data;
+		struct iov_iter *pi_iter = iocb->private;
+
+		if (blkdev_check_pi(bdev, iter->count, pi_iter->count))
+			return -EINVAL;
+	}
+
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
 	if (likely(nr_pages <= BIO_MAX_VECS)) {
 		if (is_sync_kiocb(iocb))
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 1/3] block: bio-integrity: add PI iovec to bio
  2022-09-09 12:20 ` [PATCH v4 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
@ 2022-09-09 14:38   ` Christoph Hellwig
  2022-09-09 16:19     ` Alexander V. Buev
  0 siblings, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2022-09-09 14:38 UTC (permalink / raw)
  To: Alexander V. Buev
  Cc: linux-block, io-uring, Jens Axboe, Christoph Hellwig,
	Martin K . Petersen, Pavel Begunkov, Chaitanya Kulkarni,
	Mikhail Malygin, linux

On Fri, Sep 09, 2022 at 03:20:38PM +0300, Alexander V. Buev wrote:
> Added functions to attach user PI iovec pages to bio and release this
> pages via bio_integrity_free.

Before I get into nitpicking on the nitty gritty details:

what is the reason for pinning down the memory for the iovecs here?
Other interfaces like the nvme passthrough code simply copy from
user assuming that the amount of metadata passed will usually be
rather small, and thus faster doing a copy.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 1/3] block: bio-integrity: add PI iovec to bio
  2022-09-09 14:38   ` Christoph Hellwig
@ 2022-09-09 16:19     ` Alexander V. Buev
  0 siblings, 0 replies; 7+ messages in thread
From: Alexander V. Buev @ 2022-09-09 16:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-block, io-uring, Jens Axboe, Martin K . Petersen,
	Pavel Begunkov, Chaitanya Kulkarni, Mikhail Malygin, linux

> «Внимание! Данное письмо от внешнего адресата!»
> 
> On Fri, Sep 09, 2022 at 03:20:38PM +0300, Alexander V. Buev wrote:
> > Added functions to attach user PI iovec pages to bio and release this
> > pages via bio_integrity_free.
> 
> Before I get into nitpicking on the nitty gritty details:
> 
> what is the reason for pinning down the memory for the iovecs here?
> Other interfaces like the nvme passthrough code simply copy from
> user assuming that the amount of metadata passed will usually be
> rather small, and thus faster doing a copy.

In short, for the generality of the solution.
From my point of view, we have data and metadata (PI),
and we process both data and PI with the same method.

We have also worked with large IO, where the PI can be larger than PAGE_SIZE.
I think that allocating and copying PAGE_SIZE bytes of data (and in the future more)
for every single IO is not a good idea.
Also, any block driver can register its own integrity profile
with a tuple_size of more than 8 or 16 bytes.

Maybe I am wrong, but in the future we could register some number of buffers
and pin them once at start. This is the very same idea as the "SELECT BUFFERS" technique, but
for vector operations and with PI support.

For now, we want to be able to do IO with PI to a block device
with minimal restrictions in the interface.

But I think you are right - for small IO it may be faster to allocate & copy
instead of pinning pages. Maybe this is a point for future optimization?



-- 
Alexander Buev

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v4 2/3] block: io-uring: add READV_PI/WRITEV_PI operations
  2022-09-09 12:20 ` [PATCH v4 2/3] block: io-uring: add READV_PI/WRITEV_PI operations Alexander V. Buev
@ 2022-09-15 23:22   ` kernel test robot
  0 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2022-09-15 23:22 UTC (permalink / raw)
  To: Alexander V. Buev, linux-block
  Cc: kbuild-all, io-uring, Jens Axboe, Christoph Hellwig,
	Martin K . Petersen, Pavel Begunkov, Chaitanya Kulkarni,
	Mikhail Malygin, linux, Alexander V. Buev

Hi Alexander,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.0-rc5 next-20220915]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Alexander-V-Buev/implement-direct-IO-with-integrity/20220909-202433
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: parisc-randconfig-s053-20220914 (https://download.01.org/0day-ci/archive/20220916/[email protected]/config)
compiler: hppa-linux-gcc (GCC) 12.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # apt-get install sparse
        # sparse version: v0.6.4-39-gce1a6720-dirty
        # https://github.com/intel-lab-lkp/linux/commit/81de858455c5cf1e5870106f544fe1fd179fa324
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Alexander-V-Buev/implement-direct-IO-with-integrity/20220909-202433
        git checkout 81de858455c5cf1e5870106f544fe1fd179fa324
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=parisc SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

sparse warnings: (new ones prefixed by >>)
   io_uring/rw_pi.c: note: in included file (through io_uring/io_uring.h):
   io_uring/slist.h:138:29: sparse: sparse: no newline at end of file
   io_uring/rw_pi.c:248:27: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected void *private @@     got void [noderef] __user * @@
   io_uring/rw_pi.c:248:27: sparse:     expected void *private
   io_uring/rw_pi.c:248:27: sparse:     got void [noderef] __user *
   io_uring/rw_pi.c:458:43: sparse: sparse: Using plain integer as NULL pointer
   io_uring/rw_pi.c:543:43: sparse: sparse: Using plain integer as NULL pointer
>> io_uring/rw_pi.c:266:17: sparse: sparse: cast removes address space '__user' of expression
   io_uring/rw_pi.c:266:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct iovec [noderef] __user *uvec @@     got struct iovec * @@
   io_uring/rw_pi.c:266:14: sparse:     expected struct iovec [noderef] __user *uvec
   io_uring/rw_pi.c:266:14: sparse:     got struct iovec *
   io_uring/rw_pi.c:275:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct iovec [noderef] __user *uvec @@     got struct iovec * @@
   io_uring/rw_pi.c:275:14: sparse:     expected struct iovec [noderef] __user *uvec
   io_uring/rw_pi.c:275:14: sparse:     got struct iovec *
>> io_uring/rw_pi.c:266:17: sparse: sparse: cast removes address space '__user' of expression
   io_uring/rw_pi.c:266:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct iovec [noderef] __user *uvec @@     got struct iovec * @@
   io_uring/rw_pi.c:266:14: sparse:     expected struct iovec [noderef] __user *uvec
   io_uring/rw_pi.c:266:14: sparse:     got struct iovec *
   io_uring/rw_pi.c:275:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct iovec [noderef] __user *uvec @@     got struct iovec * @@
   io_uring/rw_pi.c:275:14: sparse:     expected struct iovec [noderef] __user *uvec
   io_uring/rw_pi.c:275:14: sparse:     got struct iovec *
>> io_uring/rw_pi.c:266:17: sparse: sparse: cast removes address space '__user' of expression
   io_uring/rw_pi.c:266:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct iovec [noderef] __user *uvec @@     got struct iovec * @@
   io_uring/rw_pi.c:266:14: sparse:     expected struct iovec [noderef] __user *uvec
   io_uring/rw_pi.c:266:14: sparse:     got struct iovec *
   io_uring/rw_pi.c:275:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct iovec [noderef] __user *uvec @@     got struct iovec * @@
   io_uring/rw_pi.c:275:14: sparse:     expected struct iovec [noderef] __user *uvec
   io_uring/rw_pi.c:275:14: sparse:     got struct iovec *
>> io_uring/rw_pi.c:266:17: sparse: sparse: cast removes address space '__user' of expression
   io_uring/rw_pi.c:266:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct iovec [noderef] __user *uvec @@     got struct iovec * @@
   io_uring/rw_pi.c:266:14: sparse:     expected struct iovec [noderef] __user *uvec
   io_uring/rw_pi.c:266:14: sparse:     got struct iovec *
   io_uring/rw_pi.c:275:14: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct iovec [noderef] __user *uvec @@     got struct iovec * @@
   io_uring/rw_pi.c:275:14: sparse:     expected struct iovec [noderef] __user *uvec
   io_uring/rw_pi.c:275:14: sparse:     got struct iovec *

vim +/__user +266 io_uring/rw_pi.c

   255	
   256	
   257	static inline int
   258	io_import_iovecs_pi(int io_dir, struct io_kiocb *req, struct iovec **iovec,
   259				struct io_rw_state *s_data, struct __io_rw_pi_state *s_pi)
   260	{
   261		struct io_rw_pi *rw = io_kiocb_to_cmd(req, struct io_rw_pi);
   262		struct iovec __user *uvec;
   263		ssize_t ret;
   264	
   265		/* data */
 > 266		uvec = (struct iovec *)u64_to_user_ptr(rw->addr);
   267		iovec[DATA] = s_data->fast_iov;
   268		ret = __import_iovec(io_dir, uvec, rw->nr_segs,
   269					UIO_FASTIOV, iovec + DATA,
   270					&s_data->iter, req->ctx->compat);
   271	
   272		if (unlikely(ret <= 0))
   273			return (ret) ? ret : -EINVAL;
   274		/* pi */
   275		uvec = (struct iovec *)rw->kiocb.private;
   276		iovec[PI] = s_pi->fast_iov;
   277		ret = __import_iovec(io_dir, uvec, rw->nr_pi_segs,
   278					UIO_FASTIOV_PI, iovec + PI,
   279					&s_pi->iter, req->ctx->compat);
   280		if (unlikely(ret <= 0)) {
   281			if (iovec[DATA])
   282				kfree(iovec[DATA]);
   283			return (ret) ? ret : -EINVAL;
   284		}
   285	
   286		/* save states */
   287		io_rw_pi_state_iter_save(s_data, s_pi);
   288	
   289		return 0;
   290	}
   291	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-09-15 23:23 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-09 12:20 [PATCH v4 0/3] implement direct IO with integrity Alexander V. Buev
2022-09-09 12:20 ` [PATCH v4 1/3] block: bio-integrity: add PI iovec to bio Alexander V. Buev
2022-09-09 14:38   ` Christoph Hellwig
2022-09-09 16:19     ` Alexander V. Buev
2022-09-09 12:20 ` [PATCH v4 2/3] block: io-uring: add READV_PI/WRITEV_PI operations Alexander V. Buev
2022-09-15 23:22   ` kernel test robot
2022-09-09 12:20 ` [PATCH v4 3/3] block: fops: handle IOCB_USE_PI in direct IO Alexander V. Buev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox