* [RFC PATCH 0/4] userspace PI passthrough via io_uring
@ 2020-02-26 8:37 Bob Liu
2020-02-26 8:37 ` [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd Bob Liu
` (4 more replies)
0 siblings, 5 replies; 16+ messages in thread
From: Bob Liu @ 2020-02-26 8:37 UTC (permalink / raw)
To: linux-block
Cc: axboe, martin.petersen, linux-fsdevel, darrick.wong, io-uring,
Bob Liu
This RFC provides a rough implementation of a mechanism to allow
userspace to attach protection information (e.g. T10 DIF) data to a
disk write and to receive the information alongside a disk read.
The interface is an extension to the io_uring interface:
two new commands (IORING_OP_READV{WRITEV}_PI) are provided.
The last struct iovec in the arg list is interpreted to point to a buffer
containing the the PI data.
Patch #1 add two new commands to io_uring.
Patch #2 introduces two helper funcs in bio-integrity.
Patch #3 implement the PI passthrough in direct-io of block-dev.
(Similar extensions may add to fs/direct-io.c and fs/maps/directio.c)
Patch #4 add io_uring use space test case to liburing.
Welcome any feedbacks.
Thanks!
There was attempt before[1], but was based on AIO at that time.
[1] https://www.mail-archive.com/[email protected]/msg27537.html
Bob Liu (3):
io_uring: add IORING_OP_READ{WRITE}V_PI cmd
bio-integrity: introduce two funcs handle protect information
block_dev: support protect information passthrough
block/bio-integrity.c | 77 +++++++++++++++++++++++++++++++++++++++++++
fs/block_dev.c | 17 ++++++++++
fs/io_uring.c | 12 +++++++
include/linux/bio.h | 14 ++++++++
include/linux/fs.h | 1 +
include/uapi/linux/io_uring.h | 2 ++
6 files changed, 123 insertions(+)
--
2.9.5
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd
2020-02-26 8:37 [RFC PATCH 0/4] userspace PI passthrough via io_uring Bob Liu
@ 2020-02-26 8:37 ` Bob Liu
2020-02-26 14:24 ` Jens Axboe
2020-02-26 8:37 ` [PATCH 2/4] bio-integrity: introduce two funcs handle protect information Bob Liu
` (3 subsequent siblings)
4 siblings, 1 reply; 16+ messages in thread
From: Bob Liu @ 2020-02-26 8:37 UTC (permalink / raw)
To: linux-block
Cc: axboe, martin.petersen, linux-fsdevel, darrick.wong, io-uring,
Bob Liu
Add read and write with protect information command.
Signed-off-by: Bob Liu <[email protected]>
---
fs/io_uring.c | 12 ++++++++++++
include/linux/fs.h | 1 +
include/uapi/linux/io_uring.h | 2 ++
3 files changed, 15 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 562e3a1..c43d96a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -630,12 +630,14 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
switch (req->opcode) {
case IORING_OP_WRITEV:
+ case IORING_OP_WRITEV_PI:
case IORING_OP_WRITE_FIXED:
/* only regular files should be hashed for writes */
if (req->flags & REQ_F_ISREG)
do_hashed = true;
/* fall-through */
case IORING_OP_READV:
+ case IORING_OP_READV_PI:
case IORING_OP_READ_FIXED:
case IORING_OP_SENDMSG:
case IORING_OP_RECVMSG:
@@ -1505,6 +1507,12 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_pos = READ_ONCE(sqe->off);
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
+ if (req->opcode == IORING_OP_READV_PI ||
+ req->opcode == IORING_OP_WRITEV_PI) {
+ if (!(req->file->f_flags & O_DIRECT))
+ return -EINVAL;
+ kiocb->ki_flags |= IOCB_USE_PI;
+ }
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -3065,10 +3073,12 @@ static int io_req_defer_prep(struct io_kiocb *req,
case IORING_OP_NOP:
break;
case IORING_OP_READV:
+ case IORING_OP_READV_PI:
case IORING_OP_READ_FIXED:
ret = io_read_prep(req, sqe, true);
break;
case IORING_OP_WRITEV:
+ case IORING_OP_WRITEV_PI:
case IORING_OP_WRITE_FIXED:
ret = io_write_prep(req, sqe, true);
break;
@@ -3156,6 +3166,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
case IORING_OP_NOP:
ret = io_nop(req);
break;
+ case IORING_OP_READV_PI:
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
if (sqe) {
@@ -3166,6 +3177,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ret = io_read(req, nxt, force_nonblock);
break;
case IORING_OP_WRITEV:
+ case IORING_OP_WRITEV_PI:
case IORING_OP_WRITE_FIXED:
if (sqe) {
ret = io_write_prep(req, sqe, force_nonblock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98e0349..65fda07 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -314,6 +314,7 @@ enum rw_hint {
#define IOCB_SYNC (1 << 5)
#define IOCB_WRITE (1 << 6)
#define IOCB_NOWAIT (1 << 7)
+#define IOCB_USE_PI (1 << 8)
struct kiocb {
struct file *ki_filp;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a3300e1..98fa3f1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -62,6 +62,8 @@ enum {
IORING_OP_NOP,
IORING_OP_READV,
IORING_OP_WRITEV,
+ IORING_OP_READV_PI,
+ IORING_OP_WRITEV_PI,
IORING_OP_FSYNC,
IORING_OP_READ_FIXED,
IORING_OP_WRITE_FIXED,
--
2.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 2/4] bio-integrity: introduce two funcs handle protect information
2020-02-26 8:37 [RFC PATCH 0/4] userspace PI passthrough via io_uring Bob Liu
2020-02-26 8:37 ` [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd Bob Liu
@ 2020-02-26 8:37 ` Bob Liu
2020-02-26 16:03 ` Darrick J. Wong
2020-02-26 8:37 ` [PATCH 3/4] block_dev: support protect information passthrough Bob Liu
` (2 subsequent siblings)
4 siblings, 1 reply; 16+ messages in thread
From: Bob Liu @ 2020-02-26 8:37 UTC (permalink / raw)
To: linux-block
Cc: axboe, martin.petersen, linux-fsdevel, darrick.wong, io-uring,
Bob Liu
Introduce two funcs handle protect information passthrough from
user space.
iter_slice_protect_info() will slice the last segment as protect
information.
bio_integrity_prep_from_iovec() attach the protect information to
a bio.
Signed-off-by: Bob Liu <[email protected]>
---
block/bio-integrity.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/bio.h | 14 ++++++++++
2 files changed, 91 insertions(+)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 575df98..0b22c5d 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -12,6 +12,7 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
+#include <linux/uio.h>
#include "blk.h"
#define BIP_INLINE_VECS 4
@@ -305,6 +306,53 @@ bool bio_integrity_prep(struct bio *bio)
}
EXPORT_SYMBOL(bio_integrity_prep);
+int bio_integrity_prep_from_iovec(struct bio *bio, struct iovec *pi_iov)
+{
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct bio_integrity_payload *bip;
+ struct page *user_pi_page;
+ int nr_vec_page = 0;
+ int ret = 0, interval = 0;
+
+ if (!pi_iov || !pi_iov->iov_base)
+ return 1;
+
+ nr_vec_page = (pi_iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (nr_vec_page > 1) {
+ printk("Now only support 1 page containing integrity "
+ "metadata, while requires %d pages.\n", nr_vec_page);
+ return 1;
+ }
+
+ interval = bio_integrity_intervals(bi, bio_sectors(bio));
+ if ((interval * bi->tuple_size) != pi_iov->iov_len)
+ return 1;
+
+ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_vec_page);
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
+
+ bip->bip_iter.bi_size = pi_iov->iov_len;
+ bip->bio_iter = bio->bi_iter;
+ bip_set_seed(bip, bio->bi_iter.bi_sector);
+
+ if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
+ bip->bip_flags |= BIP_IP_CHECKSUM;
+
+ ret = get_user_pages_fast((unsigned long)(pi_iov->iov_base), nr_vec_page,
+ op_is_write(bio_op(bio)) ? FOLL_WRITE : 0,
+ &user_pi_page);
+ if (unlikely(ret < 0))
+ return 1;
+
+ ret = bio_integrity_add_page(bio, user_pi_page, pi_iov->iov_len, 0);
+ if (unlikely(ret != pi_iov->iov_len))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep_from_iovec);
+
/**
* bio_integrity_verify_fn - Integrity I/O completion worker
* @work: Work struct stored in bio to be verified
@@ -378,6 +426,35 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
}
/**
+ * iter_slice_protect_info
+ *
+ * Description: slice protection information from iter.
+ * The last iovec contains protection information pass from user space.
+ */
+int iter_slice_protect_info(struct iov_iter *iter, int nr_pages,
+ struct iovec **pi_iov)
+{
+ size_t len = 0;
+
+ /* TBD: now only support one bio. */
+ if (!iter_is_iovec(iter) || nr_pages >= BIO_MAX_PAGES - 1)
+ return 1;
+
+ /* Last iovec contains protection information. */
+ iter->nr_segs--;
+ *pi_iov = (struct iovec *)(iter->iov + iter->nr_segs);
+
+ len = (*pi_iov)->iov_len;
+ if (len > 0 && len < iter->count) {
+ iter->count -= len;
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(iter_slice_protect_info);
+
+/**
* bio_integrity_trim - Trim integrity vector
* @bio: bio whose integrity vector to update
*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3cdb84c..6172b13 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -749,6 +749,8 @@ static inline bool bioset_initialized(struct bio_set *bs)
extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
extern bool bio_integrity_prep(struct bio *);
+extern int bio_integrity_prep_from_iovec(struct bio *bio, struct iovec *pi_iov);
+extern int iter_slice_protect_info(struct iov_iter *iter, int nr_pages, struct iovec **pi_iov);
extern void bio_integrity_advance(struct bio *, unsigned int);
extern void bio_integrity_trim(struct bio *);
extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
@@ -778,6 +780,18 @@ static inline bool bio_integrity_prep(struct bio *bio)
return true;
}
+static inline int bio_integrity_prep_from_iovec(struct bio *bio,
+ struct iovec *pi_iov)
+{
+ return 0;
+}
+
+static inline int iter_slice_protect_info(struct iov_iter *iter, int nr_pages,
+ struct iovec **pi_iov)
+{
+ return 0;
+}
+
static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
gfp_t gfp_mask)
{
--
2.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 3/4] block_dev: support protect information passthrough
2020-02-26 8:37 [RFC PATCH 0/4] userspace PI passthrough via io_uring Bob Liu
2020-02-26 8:37 ` [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd Bob Liu
2020-02-26 8:37 ` [PATCH 2/4] bio-integrity: introduce two funcs handle protect information Bob Liu
@ 2020-02-26 8:37 ` Bob Liu
2020-02-26 16:04 ` Darrick J. Wong
2020-02-26 8:37 ` [PATCH 4/4] liburing/test: add testcase for " Bob Liu
2020-02-26 14:25 ` [RFC PATCH 0/4] userspace PI passthrough via io_uring Jens Axboe
4 siblings, 1 reply; 16+ messages in thread
From: Bob Liu @ 2020-02-26 8:37 UTC (permalink / raw)
To: linux-block
Cc: axboe, martin.petersen, linux-fsdevel, darrick.wong, io-uring,
Bob Liu
Support protect information passed from use sapce, on direct io
is considered now.
Signed-off-by: Bob Liu <[email protected]>
---
fs/block_dev.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 69bf2fb..10e3299 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -348,6 +348,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
int ret = 0;
+ struct iovec *pi_iov;
+
+ if (iocb->ki_flags & IOCB_USE_PI) {
+ ret = iter_slice_protect_info(iter, nr_pages, &pi_iov);
+ if (ret)
+ return -EINVAL;
+ }
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -411,6 +418,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
polled = true;
}
+ /* Add protection information to bio */
+ if (iocb->ki_flags & IOCB_USE_PI) {
+ ret = bio_integrity_prep_from_iovec(bio, pi_iov);
+ if (ret) {
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ break;
+ }
+ }
+
qc = submit_bio(bio);
if (polled)
--
2.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 4/4] liburing/test: add testcase for protect information passthrough
2020-02-26 8:37 [RFC PATCH 0/4] userspace PI passthrough via io_uring Bob Liu
` (2 preceding siblings ...)
2020-02-26 8:37 ` [PATCH 3/4] block_dev: support protect information passthrough Bob Liu
@ 2020-02-26 8:37 ` Bob Liu
2020-02-26 14:25 ` [RFC PATCH 0/4] userspace PI passthrough via io_uring Jens Axboe
4 siblings, 0 replies; 16+ messages in thread
From: Bob Liu @ 2020-02-26 8:37 UTC (permalink / raw)
To: linux-block
Cc: axboe, martin.petersen, linux-fsdevel, darrick.wong, io-uring,
Bob Liu
Demonstrate how to use IORING_OP_READ{WRITE}V_PI cmd.
Write data together with protection information, then read back and compare
results.
Signed-off-by: Bob Liu <[email protected]>
---
src/include/liburing.h | 16 ++
src/include/liburing/io_uring.h | 2 +
test/Makefile | 4 +-
test/pi_passthrough.c | 267 ++++++++++++++++++++++++++++++++
4 files changed, 287 insertions(+), 2 deletions(-)
create mode 100644 test/pi_passthrough.c
diff --git a/src/include/liburing.h b/src/include/liburing.h
index 2f1e968..2967c5b 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -206,6 +206,14 @@ static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
}
+static inline void io_uring_prep_readv_pi(struct io_uring_sqe *sqe, int fd,
+ const struct iovec *iovecs,
+ unsigned nr_vecs, off_t offset)
+{
+ io_uring_prep_rw(IORING_OP_READV_PI, sqe, fd, iovecs, nr_vecs, offset);
+}
+
+
static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd,
const struct iovec *iovecs,
unsigned nr_vecs, off_t offset)
@@ -228,6 +236,14 @@ static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd,
io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);
}
+static inline void io_uring_prep_writev_pi(struct io_uring_sqe *sqe, int fd,
+ const struct iovec *iovecs,
+ unsigned nr_vecs, off_t offset)
+{
+ io_uring_prep_rw(IORING_OP_WRITEV_PI, sqe, fd, iovecs, nr_vecs, offset);
+}
+
+
static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd,
const void *buf, unsigned nbytes,
off_t offset, int buf_index)
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 3f7961c..f5bb46f 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -86,6 +86,8 @@ enum {
IORING_OP_NOP,
IORING_OP_READV,
IORING_OP_WRITEV,
+ IORING_OP_READV_PI,
+ IORING_OP_WRITEV_PI,
IORING_OP_FSYNC,
IORING_OP_READ_FIXED,
IORING_OP_WRITE_FIXED,
diff --git a/test/Makefile b/test/Makefile
index 4a0bb4e..0bf29ec 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -13,7 +13,7 @@ all_targets += poll poll-cancel ring-leak fsync io_uring_setup io_uring_register
send_recvmsg a4c0b3decb33-test 500f9fbadef8-test timeout \
sq-space_left stdout cq-ready cq-peek-batch file-register \
cq-size 8a9973408177-test a0908ae19763-test 232c93d07b74-test \
- socket-rw accept timeout-overflow defer read-write io-cancel \
+ socket-rw accept timeout-overflow defer read-write pi_passthrough io-cancel \
link-timeout cq-overflow link_drain fc2a85cb02ef-test \
poll-link accept-link fixed-link poll-cancel-ton teardowns \
poll-many b5837bd5311d-test accept-test d77a67ed5f27-test \
@@ -40,7 +40,7 @@ test_srcs := poll.c poll-cancel.c ring-leak.c fsync.c io_uring_setup.c \
500f9fbadef8-test.c timeout.c sq-space_left.c stdout.c cq-ready.c\
cq-peek-batch.c file-register.c cq-size.c 8a9973408177-test.c \
a0908ae19763-test.c 232c93d07b74-test.c socket-rw.c accept.c \
- timeout-overflow.c defer.c read-write.c io-cancel.c link-timeout.c \
+ timeout-overflow.c defer.c read-write.c pi_passthrough.c io-cancel.c link-timeout.c \
cq-overflow.c link_drain.c fc2a85cb02ef-test.c poll-link.c \
accept-link.c fixed-link.c poll-cancel-ton.c teardowns.c poll-many.c \
b5837bd5311d-test.c accept-test.c d77a67ed5f27-test.c connect.c \
diff --git a/test/pi_passthrough.c b/test/pi_passthrough.c
new file mode 100644
index 0000000..27d679a
--- /dev/null
+++ b/test/pi_passthrough.c
@@ -0,0 +1,267 @@
+/*
+ * Simple app that demonstrates how to setup an io_uring interface,
+ * submit and complete IO against it, and then tear it down.
+ *
+ * gcc -Wall -O2 -D_GNU_SOURCE -o pi_passthrough pi_passthrough.c -luring
+ * modprobe scsi_debug.ko dif=1 guard=1 dev_size_mb=1024 num_parts=1 ato=1 dix=31
+ */
+#include <stdio.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include "liburing.h"
+
+#define NR_SECTOR 11
+#define SECTOR_SIZE 512
+#define BUF_SIZE (NR_SECTOR * SECTOR_SIZE)
+
+/* Will be round up power of 2 */
+#define QD (NR_SECTOR + 1 )
+
+struct sd_dif_tuple {
+ uint16_t guard_tag; /* Checksum */
+ uint16_t app_tag; /* Opaque storage */
+ uint32_t ref_tag; /* Target LBA or indirect LBA */
+};
+
+
+typedef unsigned short __sum16;
+static inline unsigned short from32to16(unsigned int x)
+{
+ /* add up 16-bit and 16-bit for 16+c bit */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up carry.. */
+ x = (x & 0xffff) + (x >> 16);
+ return x;
+}
+
+static unsigned int do_csum(const unsigned char *buff, int len)
+{
+ int odd, count;
+ unsigned long result = 0;
+
+ if (len <= 0)
+ goto out;
+ odd = 1 & (unsigned long) buff;
+ if (odd) {
+#ifdef __LITTLE_ENDIAN
+ result = *buff;
+#else
+ result += (*buff << 8);
+#endif
+ len--;
+ buff++;
+ }
+ count = len >> 1; /* nr of 16-bit words.. */
+ if (count) {
+ if (2 & (unsigned long) buff) {
+ result += *(unsigned short *) buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ unsigned long carry = 0;
+ do {
+ unsigned long w = *(unsigned int *) buff;
+ count--;
+ buff += 4;
+ result += carry;
+ result += w;
+ carry = (w > result);
+ } while (count);
+ result += carry;
+ result = (result & 0xffff) + (result >> 16);
+ }
+ if (len & 2) {
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+#ifdef __LITTLE_ENDIAN
+ result += *buff;
+#else
+ result += (*buff << 8);
+#endif
+ result = from32to16(result);
+ if (odd)
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+out:
+ return result;
+}
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+__sum16 ip_compute_csum(const void *buff, int len)
+{
+ return (__sum16)~do_csum(buff, len);
+}
+
+static void stamp_pi_buffer(struct sd_dif_tuple *t, uint16_t csum,
+ uint16_t tag, uint32_t sector)
+{
+ //t->guard_tag = htons(csum);
+ t->guard_tag = csum;
+ t->app_tag = getpid();
+ t->ref_tag = htonl(sector);
+}
+
+static void dump_buffer(char *buf, size_t len)
+{
+ size_t off;
+ char *p;
+
+ for (p = buf; p < buf + len; p++) {
+ off = p - buf;
+ if (off % 32 == 0) {
+ if (p != buf)
+ printf("\n");
+ printf("%05zu:", off);
+ }
+ printf(" %02x", *p & 0xFF);
+ }
+ printf("\n");
+}
+
+int io_rw(int fd, int write, void *buf, size_t len,
+ void *mbuf, size_t pi_buflen)
+{
+ struct io_uring ring;
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ struct iovec *iovecs;
+ int i, ret, pending, done, offset=0;
+
+ iovecs = calloc(QD, sizeof(struct iovec));
+ for (i = 0; i < QD - 1; i++) {
+ iovecs[i].iov_base = buf + i * SECTOR_SIZE;
+ iovecs[i].iov_len = SECTOR_SIZE;
+ }
+ /* Last iovecs store protect information */
+ iovecs[i].iov_base = mbuf;
+ iovecs[i].iov_len = pi_buflen;
+
+ if (write) {
+ __sum16 ip_csum;
+ int out_fd = 0;
+ out_fd = open("/dev/random", O_RDONLY);
+ if (out_fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ ret = read(out_fd, buf, len);
+ printf("read %d bytes from file\n", ret);
+ for( i = 0; i < NR_SECTOR; i++) {
+ ip_csum = ip_compute_csum(buf + i*SECTOR_SIZE, SECTOR_SIZE);
+ printf("ip_csum:0x%x\n", ip_csum);
+
+ stamp_pi_buffer(mbuf + i * sizeof(struct sd_dif_tuple),
+ ip_csum,0x0, i & 0xffffffff);
+ }
+ }
+
+ ret = io_uring_queue_init(1, &ring, 0);
+ if (ret < 0) {
+ fprintf(stderr, "queue_init: %s\n", strerror(-ret));
+ return 1;
+ }
+
+ sqe = io_uring_get_sqe(&ring);
+ if (!sqe)
+ return 1;
+ if(write)
+ io_uring_prep_writev_pi(sqe, fd, iovecs, QD, offset);
+ else
+ io_uring_prep_readv_pi(sqe, fd, iovecs, QD, offset);
+
+ ret = io_uring_submit(&ring);
+ if (ret < 0) {
+ fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
+ return 1;
+ }
+
+ done = 0;
+ pending = ret;
+ for (i = 0; i < pending; i++) {
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret < 0) {
+ fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
+ return 1;
+ }
+
+ done++;
+ ret = 0;
+ if (cqe->res != SECTOR_SIZE) {
+ fprintf(stderr, "ceq->res=%d, wanted %d\n",
+ cqe->res, SECTOR_SIZE);
+ ret = 1;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+ if (ret)
+ break;
+ }
+ io_uring_queue_exit(&ring);
+ printf("Submitted=%d, completed=%d\n", pending, done);
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ int fd;
+ int page_size = 4096;
+ size_t pi_buflen;
+ void *buf, *mbuf;
+ void *buf2, *mbuf2; //read back
+
+ if (argc < 2) {
+ printf("%s: file\n", argv[0]);
+ return 1;
+ }
+
+ fd = open(argv[1], O_RDWR |O_SYNC| O_DIRECT);
+ if (fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ /* write with protect information */
+ pi_buflen = NR_SECTOR * sizeof(struct sd_dif_tuple);
+ if (posix_memalign(&buf, page_size, BUF_SIZE) ||
+ posix_memalign(&mbuf, page_size, pi_buflen) ) {
+ perror("memalign");
+ return 1;
+ }
+ io_rw(fd, 1, buf, BUF_SIZE, mbuf, pi_buflen);
+
+ /* read out protect information */
+ if (posix_memalign(&buf2, page_size, BUF_SIZE) ||
+ posix_memalign(&mbuf2, page_size, pi_buflen) ) {
+ perror("memalign");
+ return 1;
+ }
+ io_rw(fd, 0, buf2, BUF_SIZE, mbuf2, pi_buflen);
+ close(fd);
+
+ /* Compare result. */
+ if(memcmp(buf, buf2, BUF_SIZE)) {
+ printf("err!! protect date mismatch!!\n");
+ dump_buffer(buf, BUF_SIZE);
+ dump_buffer(mbuf, pi_buflen);
+ }
+ if(memcmp(mbuf, mbuf2, pi_buflen)) {
+ printf("err!! protect date mismatch!!\n");
+ dump_buffer(buf2, BUF_SIZE);
+ dump_buffer(mbuf2, pi_buflen);
+ }
+ printf("test succ!!\n");
+
+ return 0;
+}
--
2.17.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd
2020-02-26 8:37 ` [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd Bob Liu
@ 2020-02-26 14:24 ` Jens Axboe
2020-02-26 15:57 ` Christoph Hellwig
2020-02-27 9:05 ` Bob Liu
0 siblings, 2 replies; 16+ messages in thread
From: Jens Axboe @ 2020-02-26 14:24 UTC (permalink / raw)
To: Bob Liu, linux-block
Cc: martin.petersen, linux-fsdevel, darrick.wong, io-uring
On 2/26/20 1:37 AM, Bob Liu wrote:
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index a3300e1..98fa3f1 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -62,6 +62,8 @@ enum {
> IORING_OP_NOP,
> IORING_OP_READV,
> IORING_OP_WRITEV,
> + IORING_OP_READV_PI,
> + IORING_OP_WRITEV_PI,
> IORING_OP_FSYNC,
> IORING_OP_READ_FIXED,
> IORING_OP_WRITE_FIXED,
So this one renumbers everything past IORING_OP_WRITEV, breaking the
ABI in a very bad way. I'm guessing that was entirely unintentional?
Any new command must go at the end of the list.
You're also not updating io_op_defs[] with the two new commands,
which means it won't compile at all. I'm guessing you tested this on
an older version of the kernel which didn't have io_op_defs[]?
--
Jens Axboe
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [RFC PATCH 0/4] userspace PI passthrough via io_uring
2020-02-26 8:37 [RFC PATCH 0/4] userspace PI passthrough via io_uring Bob Liu
` (3 preceding siblings ...)
2020-02-26 8:37 ` [PATCH 4/4] liburing/test: add testcase for " Bob Liu
@ 2020-02-26 14:25 ` Jens Axboe
4 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2020-02-26 14:25 UTC (permalink / raw)
To: Bob Liu, linux-block
Cc: martin.petersen, linux-fsdevel, darrick.wong, io-uring
On 2/26/20 1:37 AM, Bob Liu wrote:
> This RFC provides a rough implementation of a mechanism to allow
> userspace to attach protection information (e.g. T10 DIF) data to a
> disk write and to receive the information alongside a disk read.
> The interface is an extension to the io_uring interface:
> two new commands (IORING_OP_READV{WRITEV}_PI) are provided.
> The last struct iovec in the arg list is interpreted to point to a buffer
> containing the the PI data.
>
> Patch #1 add two new commands to io_uring.
> Patch #2 introduces two helper funcs in bio-integrity.
> Patch #3 implement the PI passthrough in direct-io of block-dev.
> (Similar extensions may add to fs/direct-io.c and fs/maps/directio.c)
> Patch #4 add io_uring use space test case to liburing.
No strong feelings on the support in general, the io_uring bits are
trivial enough (once fixed up, per comments in that patch) that I
have no objections on that front.
I'd really like Martin to render an opinion on the API (PI info in
last vec), since he's the integrity guy.
--
Jens Axboe
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd
2020-02-26 14:24 ` Jens Axboe
@ 2020-02-26 15:57 ` Christoph Hellwig
2020-02-26 15:58 ` Jens Axboe
2020-02-27 9:05 ` Bob Liu
1 sibling, 1 reply; 16+ messages in thread
From: Christoph Hellwig @ 2020-02-26 15:57 UTC (permalink / raw)
To: Jens Axboe
Cc: Bob Liu, linux-block, martin.petersen, linux-fsdevel,
darrick.wong, io-uring
On Wed, Feb 26, 2020 at 07:24:06AM -0700, Jens Axboe wrote:
> On 2/26/20 1:37 AM, Bob Liu wrote:
> > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> > index a3300e1..98fa3f1 100644
> > --- a/include/uapi/linux/io_uring.h
> > +++ b/include/uapi/linux/io_uring.h
> > @@ -62,6 +62,8 @@ enum {
> > IORING_OP_NOP,
> > IORING_OP_READV,
> > IORING_OP_WRITEV,
> > + IORING_OP_READV_PI,
> > + IORING_OP_WRITEV_PI,
> > IORING_OP_FSYNC,
> > IORING_OP_READ_FIXED,
> > IORING_OP_WRITE_FIXED,
>
> So this one renumbers everything past IORING_OP_WRITEV, breaking the
> ABI in a very bad way. I'm guessing that was entirely unintentional?
> Any new command must go at the end of the list.
>
> You're also not updating io_op_defs[] with the two new commands,
> which means it won't compile at all. I'm guessing you tested this on
> an older version of the kernel which didn't have io_op_defs[]?
And the real question is why we need ops insted of just a flag and
using previously reserved fields for the PI pointer.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd
2020-02-26 15:57 ` Christoph Hellwig
@ 2020-02-26 15:58 ` Jens Axboe
2020-02-26 16:03 ` Darrick J. Wong
2020-02-26 16:53 ` Christoph Hellwig
0 siblings, 2 replies; 16+ messages in thread
From: Jens Axboe @ 2020-02-26 15:58 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Bob Liu, linux-block, martin.petersen, linux-fsdevel,
darrick.wong, io-uring
On 2/26/20 8:57 AM, Christoph Hellwig wrote:
> On Wed, Feb 26, 2020 at 07:24:06AM -0700, Jens Axboe wrote:
>> On 2/26/20 1:37 AM, Bob Liu wrote:
>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>> index a3300e1..98fa3f1 100644
>>> --- a/include/uapi/linux/io_uring.h
>>> +++ b/include/uapi/linux/io_uring.h
>>> @@ -62,6 +62,8 @@ enum {
>>> IORING_OP_NOP,
>>> IORING_OP_READV,
>>> IORING_OP_WRITEV,
>>> + IORING_OP_READV_PI,
>>> + IORING_OP_WRITEV_PI,
>>> IORING_OP_FSYNC,
>>> IORING_OP_READ_FIXED,
>>> IORING_OP_WRITE_FIXED,
>>
>> So this one renumbers everything past IORING_OP_WRITEV, breaking the
>> ABI in a very bad way. I'm guessing that was entirely unintentional?
>> Any new command must go at the end of the list.
>>
>> You're also not updating io_op_defs[] with the two new commands,
>> which means it won't compile at all. I'm guessing you tested this on
>> an older version of the kernel which didn't have io_op_defs[]?
>
> And the real question is why we need ops insted of just a flag and
> using previously reserved fields for the PI pointer.
Yeah, should probably be a RWF_ flag instead, and a 64-bit SQE field
for the PI data. The 'last iovec is PI' is kind of icky.
--
Jens Axboe
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/4] bio-integrity: introduce two funcs handle protect information
2020-02-26 8:37 ` [PATCH 2/4] bio-integrity: introduce two funcs handle protect information Bob Liu
@ 2020-02-26 16:03 ` Darrick J. Wong
2020-02-27 9:23 ` Bob Liu
0 siblings, 1 reply; 16+ messages in thread
From: Darrick J. Wong @ 2020-02-26 16:03 UTC (permalink / raw)
To: Bob Liu; +Cc: linux-block, axboe, martin.petersen, linux-fsdevel, io-uring
On Wed, Feb 26, 2020 at 04:37:17PM +0800, Bob Liu wrote:
> Introduce two funcs handle protect information passthrough from
> user space.
>
> iter_slice_protect_info() will slice the last segment as protect
> information.
>
> bio_integrity_prep_from_iovec() attach the protect information to
> a bio.
>
> Signed-off-by: Bob Liu <[email protected]>
> ---
> block/bio-integrity.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/bio.h | 14 ++++++++++
> 2 files changed, 91 insertions(+)
>
> diff --git a/block/bio-integrity.c b/block/bio-integrity.c
> index 575df98..0b22c5d 100644
> --- a/block/bio-integrity.c
> +++ b/block/bio-integrity.c
> @@ -12,6 +12,7 @@
> #include <linux/bio.h>
> #include <linux/workqueue.h>
> #include <linux/slab.h>
> +#include <linux/uio.h>
> #include "blk.h"
>
> #define BIP_INLINE_VECS 4
> @@ -305,6 +306,53 @@ bool bio_integrity_prep(struct bio *bio)
> }
> EXPORT_SYMBOL(bio_integrity_prep);
>
> +int bio_integrity_prep_from_iovec(struct bio *bio, struct iovec *pi_iov)
> +{
> + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
> + struct bio_integrity_payload *bip;
> + struct page *user_pi_page;
> + int nr_vec_page = 0;
> + int ret = 0, interval = 0;
> +
> + if (!pi_iov || !pi_iov->iov_base)
> + return 1;
> +
> + nr_vec_page = (pi_iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + if (nr_vec_page > 1) {
> + printk("Now only support 1 page containing integrity "
> + "metadata, while requires %d pages.\n", nr_vec_page);
> + return 1;
I would've thought this would be -EINVAL or something given the -ENOMEM
below...?
> + }
> +
> + interval = bio_integrity_intervals(bi, bio_sectors(bio));
> + if ((interval * bi->tuple_size) != pi_iov->iov_len)
> + return 1;
> +
> + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_vec_page);
> + if (IS_ERR(bip))
> + return PTR_ERR(bip);
> +
> + bip->bip_iter.bi_size = pi_iov->iov_len;
> + bip->bio_iter = bio->bi_iter;
> + bip_set_seed(bip, bio->bi_iter.bi_sector);
> +
> + if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
> + bip->bip_flags |= BIP_IP_CHECKSUM;
> +
> + ret = get_user_pages_fast((unsigned long)(pi_iov->iov_base), nr_vec_page,
> + op_is_write(bio_op(bio)) ? FOLL_WRITE : 0,
> + &user_pi_page);
> + if (unlikely(ret < 0))
> + return 1;
> +
> + ret = bio_integrity_add_page(bio, user_pi_page, pi_iov->iov_len, 0);
> + if (unlikely(ret != pi_iov->iov_len))
> + return -ENOMEM;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(bio_integrity_prep_from_iovec);
> +
> /**
> * bio_integrity_verify_fn - Integrity I/O completion worker
> * @work: Work struct stored in bio to be verified
> @@ -378,6 +426,35 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
> }
>
> /**
> + * iter_slice_protect_info
> + *
> + * Description: slice protection information from iter.
> + * The last iovec contains protection information pass from user space.
What do the return values here mean?
Also kinda wondering about the slice & dice of the iovec here, but
<shrug> I guess this is RFC. :)
--D
> + */
> +int iter_slice_protect_info(struct iov_iter *iter, int nr_pages,
> + struct iovec **pi_iov)
> +{
> + size_t len = 0;
> +
> + /* TBD: now only support one bio. */
> + if (!iter_is_iovec(iter) || nr_pages >= BIO_MAX_PAGES - 1)
> + return 1;
> +
> + /* Last iovec contains protection information. */
> + iter->nr_segs--;
> + *pi_iov = (struct iovec *)(iter->iov + iter->nr_segs);
> +
> + len = (*pi_iov)->iov_len;
> + if (len > 0 && len < iter->count) {
> + iter->count -= len;
> + return 0;
> + }
> +
> + return 1;
> +}
> +EXPORT_SYMBOL(iter_slice_protect_info);
> +
> +/**
> * bio_integrity_trim - Trim integrity vector
> * @bio: bio whose integrity vector to update
> *
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 3cdb84c..6172b13 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -749,6 +749,8 @@ static inline bool bioset_initialized(struct bio_set *bs)
> extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
> extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
> extern bool bio_integrity_prep(struct bio *);
> +extern int bio_integrity_prep_from_iovec(struct bio *bio, struct iovec *pi_iov);
> +extern int iter_slice_protect_info(struct iov_iter *iter, int nr_pages, struct iovec **pi_iov);
> extern void bio_integrity_advance(struct bio *, unsigned int);
> extern void bio_integrity_trim(struct bio *);
> extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
> @@ -778,6 +780,18 @@ static inline bool bio_integrity_prep(struct bio *bio)
> return true;
> }
>
> +static inline int bio_integrity_prep_from_iovec(struct bio *bio,
> + struct iovec *pi_iov)
> +{
> + return 0;
> +}
> +
> +static inline int iter_slice_protect_info(struct iov_iter *iter, int nr_pages,
> + struct iovec **pi_iov)
> +{
> + return 0;
> +}
> +
> static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
> gfp_t gfp_mask)
> {
> --
> 2.9.5
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd
2020-02-26 15:58 ` Jens Axboe
@ 2020-02-26 16:03 ` Darrick J. Wong
2020-02-26 16:53 ` Christoph Hellwig
1 sibling, 0 replies; 16+ messages in thread
From: Darrick J. Wong @ 2020-02-26 16:03 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, Bob Liu, linux-block, martin.petersen,
linux-fsdevel, io-uring
On Wed, Feb 26, 2020 at 08:58:46AM -0700, Jens Axboe wrote:
> On 2/26/20 8:57 AM, Christoph Hellwig wrote:
> > On Wed, Feb 26, 2020 at 07:24:06AM -0700, Jens Axboe wrote:
> >> On 2/26/20 1:37 AM, Bob Liu wrote:
> >>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> >>> index a3300e1..98fa3f1 100644
> >>> --- a/include/uapi/linux/io_uring.h
> >>> +++ b/include/uapi/linux/io_uring.h
> >>> @@ -62,6 +62,8 @@ enum {
> >>> IORING_OP_NOP,
> >>> IORING_OP_READV,
> >>> IORING_OP_WRITEV,
> >>> + IORING_OP_READV_PI,
> >>> + IORING_OP_WRITEV_PI,
> >>> IORING_OP_FSYNC,
> >>> IORING_OP_READ_FIXED,
> >>> IORING_OP_WRITE_FIXED,
> >>
> >> So this one renumbers everything past IORING_OP_WRITEV, breaking the
> >> ABI in a very bad way. I'm guessing that was entirely unintentional?
> >> Any new command must go at the end of the list.
> >>
> >> You're also not updating io_op_defs[] with the two new commands,
> >> which means it won't compile at all. I'm guessing you tested this on
> >> an older version of the kernel which didn't have io_op_defs[]?
> >
> > And the real question is why we need ops insted of just a flag and
> > using previously reserved fields for the PI pointer.
>
> Yeah, should probably be a RWF_ flag instead, and a 64-bit SQE field
> for the PI data. The 'last iovec is PI' is kind of icky.
Heh, I was about to send in nearly the same comment. :)
--D
> --
> Jens Axboe
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 3/4] block_dev: support protect information passthrough
2020-02-26 8:37 ` [PATCH 3/4] block_dev: support protect information passthrough Bob Liu
@ 2020-02-26 16:04 ` Darrick J. Wong
0 siblings, 0 replies; 16+ messages in thread
From: Darrick J. Wong @ 2020-02-26 16:04 UTC (permalink / raw)
To: Bob Liu; +Cc: linux-block, axboe, martin.petersen, linux-fsdevel, io-uring
On Wed, Feb 26, 2020 at 04:37:18PM +0800, Bob Liu wrote:
> Support protect information passed from use sapce, on direct io
> is considered now.
>
> Signed-off-by: Bob Liu <[email protected]>
> ---
> fs/block_dev.c | 17 +++++++++++++++++
> 1 file changed, 17 insertions(+)
>
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 69bf2fb..10e3299 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -348,6 +348,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
> loff_t pos = iocb->ki_pos;
> blk_qc_t qc = BLK_QC_T_NONE;
> int ret = 0;
> + struct iovec *pi_iov;
> +
> + if (iocb->ki_flags & IOCB_USE_PI) {
> + ret = iter_slice_protect_info(iter, nr_pages, &pi_iov);
> + if (ret)
> + return -EINVAL;
> + }
>
> if ((pos | iov_iter_alignment(iter)) &
> (bdev_logical_block_size(bdev) - 1))
> @@ -411,6 +418,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
> polled = true;
> }
>
> + /* Add protection information to bio */
> + if (iocb->ki_flags & IOCB_USE_PI) {
> + ret = bio_integrity_prep_from_iovec(bio, pi_iov);
> + if (ret) {
> + bio->bi_status = BLK_STS_IOERR;
> + bio_endio(bio);
If you're just going to mash all the error codes into IOERR, then this
could very well become bio_io_error() ?
--D
> + break;
> + }
> + }
> +
> qc = submit_bio(bio);
>
> if (polled)
> --
> 2.9.5
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd
2020-02-26 15:58 ` Jens Axboe
2020-02-26 16:03 ` Darrick J. Wong
@ 2020-02-26 16:53 ` Christoph Hellwig
2020-02-27 9:19 ` Bob Liu
1 sibling, 1 reply; 16+ messages in thread
From: Christoph Hellwig @ 2020-02-26 16:53 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, Bob Liu, linux-block, martin.petersen,
linux-fsdevel, darrick.wong, io-uring
On Wed, Feb 26, 2020 at 08:58:46AM -0700, Jens Axboe wrote:
> Yeah, should probably be a RWF_ flag instead, and a 64-bit SQE field
> for the PI data. The 'last iovec is PI' is kind of icky.
Abusing an iovec (although I though of the first once when looking
into it) looks really horrible, but has two huge advantages:
- it doesn't require passing another argument all the way down
the I/O stack
- it works with all the vectored interfaces that take a flag
argument, so not just io_uring, but also preadv2/pwritev2 and aio.
And while I don't care too much about the last I think preadv2
and pwritev2 are valuable to support.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd
2020-02-26 14:24 ` Jens Axboe
2020-02-26 15:57 ` Christoph Hellwig
@ 2020-02-27 9:05 ` Bob Liu
1 sibling, 0 replies; 16+ messages in thread
From: Bob Liu @ 2020-02-27 9:05 UTC (permalink / raw)
To: Jens Axboe, linux-block
Cc: martin.petersen, linux-fsdevel, darrick.wong, io-uring
On 2/26/20 10:24 PM, Jens Axboe wrote:
> On 2/26/20 1:37 AM, Bob Liu wrote:
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index a3300e1..98fa3f1 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -62,6 +62,8 @@ enum {
>> IORING_OP_NOP,
>> IORING_OP_READV,
>> IORING_OP_WRITEV,
>> + IORING_OP_READV_PI,
>> + IORING_OP_WRITEV_PI,
>> IORING_OP_FSYNC,
>> IORING_OP_READ_FIXED,
>> IORING_OP_WRITE_FIXED,
>
> So this one renumbers everything past IORING_OP_WRITEV, breaking the
> ABI in a very bad way. I'm guessing that was entirely unintentional?
> Any new command must go at the end of the list.
>
> You're also not updating io_op_defs[] with the two new commands,
> which means it won't compile at all. I'm guessing you tested this on
> an older version of the kernel which didn't have io_op_defs[]?
>
Yep, will rebase to the latest version next time.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd
2020-02-26 16:53 ` Christoph Hellwig
@ 2020-02-27 9:19 ` Bob Liu
0 siblings, 0 replies; 16+ messages in thread
From: Bob Liu @ 2020-02-27 9:19 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe
Cc: linux-block, martin.petersen, linux-fsdevel, darrick.wong,
io-uring
On 2/27/20 12:53 AM, Christoph Hellwig wrote:
> On Wed, Feb 26, 2020 at 08:58:46AM -0700, Jens Axboe wrote:
>> Yeah, should probably be a RWF_ flag instead, and a 64-bit SQE field
>> for the PI data. The 'last iovec is PI' is kind of icky.
>
> Abusing an iovec (although I though of the first once when looking
> into it) looks really horrible, but has two huge advantages:
>
> - it doesn't require passing another argument all the way down
> the I/O stack
> - it works with all the vectored interfaces that take a flag
> argument, so not just io_uring, but also preadv2/pwritev2 and aio.
> And while I don't care too much about the last I think preadv2
> and pwritev2 are valuable to support.
>
Indeed, actually the 'last iovec is PI' idea was learned from Darrick's original
patch which support PI passthrough via aio.
https://www.mail-archive.com/[email protected]/msg27537.html
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 2/4] bio-integrity: introduce two funcs handle protect information
2020-02-26 16:03 ` Darrick J. Wong
@ 2020-02-27 9:23 ` Bob Liu
0 siblings, 0 replies; 16+ messages in thread
From: Bob Liu @ 2020-02-27 9:23 UTC (permalink / raw)
To: Darrick J. Wong
Cc: linux-block, axboe, martin.petersen, linux-fsdevel, io-uring
On 2/27/20 12:03 AM, Darrick J. Wong wrote:
> On Wed, Feb 26, 2020 at 04:37:17PM +0800, Bob Liu wrote:
>> Introduce two funcs handle protect information passthrough from
>> user space.
>>
>> iter_slice_protect_info() will slice the last segment as protect
>> information.
>>
>> bio_integrity_prep_from_iovec() attach the protect information to
>> a bio.
>>
>> Signed-off-by: Bob Liu <[email protected]>
>> ---
>> block/bio-integrity.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> include/linux/bio.h | 14 ++++++++++
>> 2 files changed, 91 insertions(+)
>>
>> diff --git a/block/bio-integrity.c b/block/bio-integrity.c
>> index 575df98..0b22c5d 100644
>> --- a/block/bio-integrity.c
>> +++ b/block/bio-integrity.c
>> @@ -12,6 +12,7 @@
>> #include <linux/bio.h>
>> #include <linux/workqueue.h>
>> #include <linux/slab.h>
>> +#include <linux/uio.h>
>> #include "blk.h"
>>
>> #define BIP_INLINE_VECS 4
>> @@ -305,6 +306,53 @@ bool bio_integrity_prep(struct bio *bio)
>> }
>> EXPORT_SYMBOL(bio_integrity_prep);
>>
>> +int bio_integrity_prep_from_iovec(struct bio *bio, struct iovec *pi_iov)
>> +{
>> + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
>> + struct bio_integrity_payload *bip;
>> + struct page *user_pi_page;
>> + int nr_vec_page = 0;
>> + int ret = 0, interval = 0;
>> +
>> + if (!pi_iov || !pi_iov->iov_base)
>> + return 1;
>> +
>> + nr_vec_page = (pi_iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
>> + if (nr_vec_page > 1) {
>> + printk("Now only support 1 page containing integrity "
>> + "metadata, while requires %d pages.\n", nr_vec_page);
>> + return 1;
>
> I would've thought this would be -EINVAL or something given the -ENOMEM
> below...?
>
>> + }
>> +
>> + interval = bio_integrity_intervals(bi, bio_sectors(bio));
>> + if ((interval * bi->tuple_size) != pi_iov->iov_len)
>> + return 1;
>> +
>> + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_vec_page);
>> + if (IS_ERR(bip))
>> + return PTR_ERR(bip);
>> +
>> + bip->bip_iter.bi_size = pi_iov->iov_len;
>> + bip->bio_iter = bio->bi_iter;
>> + bip_set_seed(bip, bio->bi_iter.bi_sector);
>> +
>> + if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
>> + bip->bip_flags |= BIP_IP_CHECKSUM;
>> +
>> + ret = get_user_pages_fast((unsigned long)(pi_iov->iov_base), nr_vec_page,
>> + op_is_write(bio_op(bio)) ? FOLL_WRITE : 0,
>> + &user_pi_page);
>> + if (unlikely(ret < 0))
>> + return 1;
>> +
>> + ret = bio_integrity_add_page(bio, user_pi_page, pi_iov->iov_len, 0);
>> + if (unlikely(ret != pi_iov->iov_len))
>> + return -ENOMEM;
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL(bio_integrity_prep_from_iovec);
>> +
>> /**
>> * bio_integrity_verify_fn - Integrity I/O completion worker
>> * @work: Work struct stored in bio to be verified
>> @@ -378,6 +426,35 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
>> }
>>
>> /**
>> + * iter_slice_protect_info
>> + *
>> + * Description: slice protection information from iter.
>> + * The last iovec contains protection information pass from user space.
>
> What do the return values here mean?
>
Will update.
> Also kinda wondering about the slice & dice of the iovec here, but
> <shrug> I guess this is RFC. :)
>
Hmm, I also very hesitate to put it here or lib/iov_iter.c.
> --D
>
>> + */
>> +int iter_slice_protect_info(struct iov_iter *iter, int nr_pages,
>> + struct iovec **pi_iov)
>> +{
>> + size_t len = 0;
>> +
>> + /* TBD: now only support one bio. */
>> + if (!iter_is_iovec(iter) || nr_pages >= BIO_MAX_PAGES - 1)
>> + return 1;
>> +
>> + /* Last iovec contains protection information. */
>> + iter->nr_segs--;
>> + *pi_iov = (struct iovec *)(iter->iov + iter->nr_segs);
>> +
>> + len = (*pi_iov)->iov_len;
>> + if (len > 0 && len < iter->count) {
>> + iter->count -= len;
>> + return 0;
>> + }
>> +
>> + return 1;
>> +}
>> +EXPORT_SYMBOL(iter_slice_protect_info);
>> +
>> +/**
>> * bio_integrity_trim - Trim integrity vector
>> * @bio: bio whose integrity vector to update
>> *
>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>> index 3cdb84c..6172b13 100644
>> --- a/include/linux/bio.h
>> +++ b/include/linux/bio.h
>> @@ -749,6 +749,8 @@ static inline bool bioset_initialized(struct bio_set *bs)
>> extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
>> extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
>> extern bool bio_integrity_prep(struct bio *);
>> +extern int bio_integrity_prep_from_iovec(struct bio *bio, struct iovec *pi_iov);
>> +extern int iter_slice_protect_info(struct iov_iter *iter, int nr_pages, struct iovec **pi_iov);
>> extern void bio_integrity_advance(struct bio *, unsigned int);
>> extern void bio_integrity_trim(struct bio *);
>> extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
>> @@ -778,6 +780,18 @@ static inline bool bio_integrity_prep(struct bio *bio)
>> return true;
>> }
>>
>> +static inline int bio_integrity_prep_from_iovec(struct bio *bio,
>> + struct iovec *pi_iov)
>> +{
>> + return 0;
>> +}
>> +
>> +static inline int iter_slice_protect_info(struct iov_iter *iter, int nr_pages,
>> + struct iovec **pi_iov)
>> +{
>> + return 0;
>> +}
>> +
>> static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
>> gfp_t gfp_mask)
>> {
>> --
>> 2.9.5
>>
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2020-02-27 9:25 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-02-26 8:37 [RFC PATCH 0/4] userspace PI passthrough via io_uring Bob Liu
2020-02-26 8:37 ` [PATCH 1/4] io_uring: add IORING_OP_READ{WRITE}V_PI cmd Bob Liu
2020-02-26 14:24 ` Jens Axboe
2020-02-26 15:57 ` Christoph Hellwig
2020-02-26 15:58 ` Jens Axboe
2020-02-26 16:03 ` Darrick J. Wong
2020-02-26 16:53 ` Christoph Hellwig
2020-02-27 9:19 ` Bob Liu
2020-02-27 9:05 ` Bob Liu
2020-02-26 8:37 ` [PATCH 2/4] bio-integrity: introduce two funcs handle protect information Bob Liu
2020-02-26 16:03 ` Darrick J. Wong
2020-02-27 9:23 ` Bob Liu
2020-02-26 8:37 ` [PATCH 3/4] block_dev: support protect information passthrough Bob Liu
2020-02-26 16:04 ` Darrick J. Wong
2020-02-26 8:37 ` [PATCH 4/4] liburing/test: add testcase for " Bob Liu
2020-02-26 14:25 ` [RFC PATCH 0/4] userspace PI passthrough via io_uring Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox