* [PATCHv2 1/7] blk-mq: add ops to dma map bvec
2022-08-02 19:36 [PATCHv2 0/7] dma mapping optimisations Keith Busch
@ 2022-08-02 19:36 ` Keith Busch
2022-08-02 19:36 ` [PATCHv2 2/7] file: " Keith Busch
` (6 subsequent siblings)
7 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
To: linux-nvme, linux-block, io-uring, linux-fsdevel
Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <[email protected]>
The same buffer may be used for many subsequent IOs. Instead of setting
up the mapping per-IO, provide an interface that allows a buffer to be
premapped just once and referenced again later.
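As a rough sketch of the intended usage (not part of this patch; the
bvec setup helper named here is hypothetical), a caller holding a
long-lived buffer would map it once and reuse the returned cookie:

	struct bio_vec *bvec;	/* describes the long-lived buffer */
	void *dma_tag;
	int nr_vecs;

	bvec = make_bvecs_for_buffer(buf, len, &nr_vecs);	/* hypothetical */
	dma_tag = block_dma_map(bdev, bvec, nr_vecs);
	if (IS_ERR(dma_tag))
		return PTR_ERR(dma_tag);	/* driver lacks ->dma_map */

	/* ... issue any number of IOs referencing dma_tag ... */

	block_dma_unmap(bdev, dma_tag);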
Signed-off-by: Keith Busch <[email protected]>
---
block/bdev.c | 20 ++++++++++++++++++++
include/linux/blk-mq.h | 13 +++++++++++++
include/linux/blkdev.h | 16 ++++++++++++++++
3 files changed, 49 insertions(+)
diff --git a/block/bdev.c b/block/bdev.c
index ce05175e71ce..c3d73ad86fae 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1069,3 +1069,23 @@ void sync_bdevs(bool wait)
spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
+
+#ifdef CONFIG_HAS_DMA
+void *block_dma_map(struct block_device *bdev, struct bio_vec *bvec,
+ int nr_vecs)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q->mq_ops && q->mq_ops->dma_map)
+ return q->mq_ops->dma_map(q, bvec, nr_vecs);
+ return ERR_PTR(-EINVAL);
+}
+
+void block_dma_unmap(struct block_device *bdev, void *dma_tag)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q->mq_ops && q->mq_ops->dma_unmap)
+ return q->mq_ops->dma_unmap(q, dma_tag);
+}
+#endif
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index effee1dc715a..e10aabb36c2c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -639,6 +639,19 @@ struct blk_mq_ops {
*/
void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
+
+#ifdef CONFIG_HAS_DMA
+ /**
+ * @dma_map: Create a dma mapping. On success, returns an opaque cookie
+ * that can be referenced by the driver in future requests.
+ */
+ void *(*dma_map)(struct request_queue *q, struct bio_vec *bvec, int nr_vecs);
+
+ /**
+ * @dma_unmap: Tear down a previously created dma mapping.
+ */
+ void (*dma_unmap)(struct request_queue *q, void *dma_tag);
+#endif
};
enum {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 49dcd31e283e..efc5e805a46e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1527,4 +1527,20 @@ struct io_comp_batch {
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
+#ifdef CONFIG_HAS_DMA
+void *block_dma_map(struct block_device *bdev, struct bio_vec *bvec,
+ int nr_vecs);
+void block_dma_unmap(struct block_device *bdev, void *dma_tag);
+#else
+static inline void *block_dma_map(struct block_device *bdev,
+ struct bio_vec *bvec, int nr_vecs)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+static inline void block_dma_unmap(struct block_device *bdev, void *dma_tag)
+{
+}
+#endif
+
#endif /* _LINUX_BLKDEV_H */
--
2.30.2
* [PATCHv2 2/7] file: add ops to dma map bvec
2022-08-02 19:36 [PATCHv2 0/7] dma mapping optimisations Keith Busch
2022-08-02 19:36 ` [PATCHv2 1/7] blk-mq: add ops to dma map bvec Keith Busch
@ 2022-08-02 19:36 ` Keith Busch
2022-08-02 19:36 ` [PATCHv2 3/7] iov_iter: introduce type for preregistered dma tags Keith Busch
` (5 subsequent siblings)
7 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
To: linux-nvme, linux-block, io-uring, linux-fsdevel
Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <[email protected]>
The same buffer may be used for many subsequent IOs. Instead of setting
up the mapping per-IO, provide an interface that allows a buffer to be
premapped just once and referenced again later, and implement it for the
block device file.
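As an illustration of how another file implementation could hook into
the new ops (everything named foofs_* here is hypothetical; the block
device wiring is what this patch actually adds below):

#ifdef CONFIG_HAS_DMA
static void *foofs_dma_map(struct file *filp, struct bio_vec *bvec,
			   int nr_vecs)
{
	/* translate to whatever premapping the backing device supports */
	return foofs_premap(filp->private_data, bvec, nr_vecs);
}

static void foofs_dma_unmap(struct file *filp, void *dma_tag)
{
	foofs_unpremap(filp->private_data, dma_tag);
}
#endif

static const struct file_operations foofs_fops = {
	/* ...usual read/write/mmap ops... */
#ifdef CONFIG_HAS_DMA
	.dma_map	= foofs_dma_map,
	.dma_unmap	= foofs_dma_unmap,
#endif
};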
Signed-off-by: Keith Busch <[email protected]>
---
block/fops.c | 20 ++++++++++++++++++++
fs/file.c | 15 +++++++++++++++
include/linux/fs.h | 20 ++++++++++++++++++++
3 files changed, 55 insertions(+)
diff --git a/block/fops.c b/block/fops.c
index 29066ac5a2fa..db2d1e848f4b 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -670,6 +670,22 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return error;
}
+#ifdef CONFIG_HAS_DMA
+void *blkdev_dma_map(struct file *filp, struct bio_vec *bvec, int nr_vecs)
+{
+ struct block_device *bdev = filp->private_data;
+
+ return block_dma_map(bdev, bvec, nr_vecs);
+}
+
+void blkdev_dma_unmap(struct file *filp, void *dma_tag)
+{
+ struct block_device *bdev = filp->private_data;
+
+ return block_dma_unmap(bdev, dma_tag);
+}
+#endif
+
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
@@ -686,6 +702,10 @@ const struct file_operations def_blk_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
+#ifdef CONFIG_HAS_DMA
+ .dma_map = blkdev_dma_map,
+ .dma_unmap = blkdev_dma_unmap,
+#endif
};
static __init int blkdev_init(void)
diff --git a/fs/file.c b/fs/file.c
index 3bcc1ecc314a..767bf9d3205e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1307,3 +1307,18 @@ int iterate_fd(struct files_struct *files, unsigned n,
return res;
}
EXPORT_SYMBOL(iterate_fd);
+
+#ifdef CONFIG_HAS_DMA
+void *file_dma_map(struct file *file, struct bio_vec *bvec, int nr_vecs)
+{
+ if (file->f_op->dma_map)
+ return file->f_op->dma_map(file, bvec, nr_vecs);
+ return ERR_PTR(-EINVAL);
+}
+
+void file_dma_unmap(struct file *file, void *dma_tag)
+{
+ if (file->f_op->dma_unmap)
+ return file->f_op->dma_unmap(file, dma_tag);
+}
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0d99b5a166b..befd8ea5821a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1964,6 +1964,10 @@ struct dir_context {
struct iov_iter;
struct io_uring_cmd;
+#ifdef CONFIG_HAS_DMA
+struct bio_vec;
+#endif
+
struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
@@ -2006,6 +2010,10 @@ struct file_operations {
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
+#ifdef CONFIG_HAS_DMA
+ void *(*dma_map)(struct file *, struct bio_vec *, int);
+ void (*dma_unmap)(struct file *, void *);
+#endif
} __randomize_layout;
struct inode_operations {
@@ -3467,4 +3475,16 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
int advice);
+#ifdef CONFIG_HAS_DMA
+void *file_dma_map(struct file *file, struct bio_vec *bvec, int nr_vecs);
+void file_dma_unmap(struct file *file, void *dma_tag);
+#else
+static inline void *file_dma_map(struct file *file, struct bio_vec *bvec,
+ int nr_vecs)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+static inline void file_dma_unmap(struct file *file, void *dma_tag) {}
+#endif
+
#endif /* _LINUX_FS_H */
--
2.30.2
* [PATCHv2 3/7] iov_iter: introduce type for preregistered dma tags
2022-08-02 19:36 [PATCHv2 0/7] dma mapping optimisations Keith Busch
2022-08-02 19:36 ` [PATCHv2 1/7] blk-mq: add ops to dma map bvec Keith Busch
2022-08-02 19:36 ` [PATCHv2 2/7] file: " Keith Busch
@ 2022-08-02 19:36 ` Keith Busch
2022-08-02 19:36 ` [PATCHv2 4/7] block: add dma tag bio type Keith Busch
` (4 subsequent siblings)
7 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
To: linux-nvme, linux-block, io-uring, linux-fsdevel
Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <[email protected]>
Introduce a new iov_iter type representing a pre-registered DMA address
tag. The tag is an opaque cookie specific to the lower-level driver that
created it, and it can be referenced at an arbitrary offset.
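A minimal sketch of how a submitter would build and consume such an
iterator (assuming a dma_tag obtained through the file/block helpers in
the earlier patches):

	struct iov_iter iter;

	/* describe `len` bytes starting `offset` bytes into the premapped buffer */
	iov_iter_dma_tag(&iter, WRITE, dma_tag, offset, len);

	if (iov_iter_is_dma_tag(&iter)) {
		/* no pages or bvecs to walk; the consumer uses
		 * iter.dma_tag plus iter.iov_offset directly */
	}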
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/uio.h | 9 +++++++++
lib/iov_iter.c | 24 +++++++++++++++++++++---
2 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 34ba4a731179..a55e4b86413a 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -26,6 +26,7 @@ enum iter_type {
ITER_PIPE,
ITER_XARRAY,
ITER_DISCARD,
+ ITER_DMA_TAG,
};
struct iov_iter_state {
@@ -46,6 +47,7 @@ struct iov_iter {
const struct bio_vec *bvec;
struct xarray *xarray;
struct pipe_inode_info *pipe;
+ void *dma_tag;
};
union {
unsigned long nr_segs;
@@ -85,6 +87,11 @@ static inline bool iov_iter_is_bvec(const struct iov_iter *i)
return iov_iter_type(i) == ITER_BVEC;
}
+static inline bool iov_iter_is_dma_tag(const struct iov_iter *i)
+{
+ return iov_iter_type(i) == ITER_DMA_TAG;
+}
+
static inline bool iov_iter_is_pipe(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_PIPE;
@@ -229,6 +236,8 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec
unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
unsigned long nr_segs, size_t count);
+void iov_iter_dma_tag(struct iov_iter *i, unsigned int direction, void *dma_tag,
+ unsigned int dma_offset, size_t count);
void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe,
size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 507e732ef7cf..d370b45d7f1b 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1077,6 +1077,9 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
i->count -= size;
} else if (iov_iter_is_discard(i)) {
i->count -= size;
+ } else if (iov_iter_is_dma_tag(i)) {
+ i->iov_offset += size;
+ i->count -= size;
}
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -1201,6 +1204,21 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
}
EXPORT_SYMBOL(iov_iter_bvec);
+void iov_iter_dma_tag(struct iov_iter *i, unsigned int direction,
+ void *dma_tag, unsigned int dma_offset,
+ size_t count)
+{
+ WARN_ON(direction & ~(READ | WRITE));
+ *i = (struct iov_iter){
+ .iter_type = ITER_DMA_TAG,
+ .data_source = direction,
+ .dma_tag = dma_tag,
+ .iov_offset = dma_offset,
+ .count = count
+ };
+}
+EXPORT_SYMBOL(iov_iter_dma_tag);
+
void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
struct pipe_inode_info *pipe,
size_t count)
@@ -2124,8 +2142,8 @@ EXPORT_SYMBOL(import_single_range);
*/
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
- if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
- !iov_iter_is_kvec(i))
+ if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
+ !iov_iter_is_dma_tag(i)) && !iov_iter_is_kvec(i))
return;
i->iov_offset = state->iov_offset;
i->count = state->count;
@@ -2141,7 +2159,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
if (iov_iter_is_bvec(i))
i->bvec -= state->nr_segs - i->nr_segs;
- else
+ else if (!iov_iter_is_dma_tag(i))
i->iov -= state->nr_segs - i->nr_segs;
i->nr_segs = state->nr_segs;
}
--
2.30.2
* [PATCHv2 4/7] block: add dma tag bio type
2022-08-02 19:36 [PATCHv2 0/7] dma mapping optimisations Keith Busch
` (2 preceding siblings ...)
2022-08-02 19:36 ` [PATCHv2 3/7] iov_iter: introduce type for preregistered dma tags Keith Busch
@ 2022-08-02 19:36 ` Keith Busch
2022-08-02 19:36 ` [PATCHv2 5/7] io_uring: introduce file slot release helper Keith Busch
` (3 subsequent siblings)
7 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
To: linux-nvme, linux-block, io-uring, linux-fsdevel
Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <[email protected]>
Premapped buffers don't require a generic bio_vec since they have
already been DMA mapped to the driver-specific data structure. Repurpose
bi_io_vec to hold the driver-specific tag, as the two are mutually
exclusive, and provide the setup and split helpers needed to support dma
tags.
In order to use this, a driver must implement the .dma_map() blk-mq op.
If the driver provides this callback, it must be aware that any given
bio may be using a dma_tag instead of a bio_vec.
Note that this does not currently work with blk_integrity.
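For reference, a sketch of what the driver-facing side looks like with
the helpers added below (setup_hw_descriptors_from_tag() is a made-up
stand-in for the driver's own descriptor builder):

	/* inside a blk-mq driver's ->queue_rq() */
	void *tag = blk_rq_dma_tag(rq);

	if (tag) {
		/* bio carries no bio_vec: use the premapped cookie plus the
		 * byte offset of this request into the registered buffer */
		setup_hw_descriptors_from_tag(tag, blk_rq_dma_offset(rq),
					      blk_rq_payload_bytes(rq));
	} else {
		/* normal path: walk the bio_vecs / build a scatterlist */
	}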
Signed-off-by: Keith Busch <[email protected]>
---
block/bio.c | 25 ++++++++++++++++++++++++-
block/blk-merge.c | 19 +++++++++++++++++++
include/linux/bio.h | 21 ++++++++++++---------
include/linux/blk-mq.h | 11 +++++++++++
include/linux/blk_types.h | 6 +++++-
5 files changed, 71 insertions(+), 11 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index d6eb90d9b20b..3b7accae8996 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -229,7 +229,8 @@ static void bio_free(struct bio *bio)
WARN_ON_ONCE(!bs);
bio_uninit(bio);
- bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
+ if (!bio_flagged(bio, BIO_DMA_TAGGED))
+ bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
mempool_free(p - bs->front_pad, &bs->bio_pool);
}
@@ -762,6 +763,8 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
+ if (bio_flagged(bio_src, BIO_DMA_TAGGED))
+ bio_set_flag(bio, BIO_DMA_TAGGED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_iter = bio_src->bi_iter;
@@ -1151,6 +1154,21 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
+static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
+{
+ size_t size = iov_iter_count(iter);
+
+ bio->bi_vcnt = iter->nr_segs;
+ bio->bi_dma_tag = iter->dma_tag;
+ bio->bi_iter.bi_bvec_done = iter->iov_offset;
+ bio->bi_iter.bi_size = size;
+ bio->bi_opf |= REQ_NOMERGE;
+ bio_set_flag(bio, BIO_NO_PAGE_REF);
+ bio_set_flag(bio, BIO_DMA_TAGGED);
+
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
+}
+
static int bio_iov_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
@@ -1287,6 +1305,11 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return 0;
}
+ if (iov_iter_is_dma_tag(iter)) {
+ bio_iov_dma_tag_set(bio, iter);
+ return 0;
+ }
+
do {
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ff04e9290715..d024885ad4c4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -274,6 +274,25 @@ static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
struct bvec_iter iter;
unsigned nsegs = 0, bytes = 0;
+ if (bio_flagged(bio, BIO_DMA_TAGGED)) {
+ int offset = offset_in_page(bio->bi_iter.bi_bvec_done);
+
+ nsegs = ALIGN(bio->bi_iter.bi_size + offset, PAGE_SIZE) >>
+ PAGE_SHIFT;
+ if (bio->bi_iter.bi_size > max_bytes) {
+ bytes = max_bytes;
+ nsegs = (bytes + offset) >> PAGE_SHIFT;
+ } else if (nsegs > lim->max_segments) {
+ nsegs = lim->max_segments;
+ bytes = PAGE_SIZE * nsegs - offset;
+ } else {
+ *segs = nsegs;
+ return NULL;
+ }
+
+ goto split;
+ }
+
bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ca22b06700a9..649348bc03c2 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -61,11 +61,17 @@ static inline bool bio_has_data(struct bio *bio)
return false;
}
+static inline bool bio_flagged(const struct bio *bio, unsigned int bit)
+{
+ return (bio->bi_flags & (1U << bit)) != 0;
+}
+
static inline bool bio_no_advance_iter(const struct bio *bio)
{
return bio_op(bio) == REQ_OP_DISCARD ||
bio_op(bio) == REQ_OP_SECURE_ERASE ||
- bio_op(bio) == REQ_OP_WRITE_ZEROES;
+ bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+ bio_flagged(bio, BIO_DMA_TAGGED);
}
static inline void *bio_data(struct bio *bio)
@@ -98,9 +104,11 @@ static inline void bio_advance_iter(const struct bio *bio,
{
iter->bi_sector += bytes >> 9;
- if (bio_no_advance_iter(bio))
+ if (bio_no_advance_iter(bio)) {
iter->bi_size -= bytes;
- else
+ if (bio_flagged(bio, BIO_DMA_TAGGED))
+ iter->bi_bvec_done += bytes;
+ } else
bvec_iter_advance(bio->bi_io_vec, iter, bytes);
/* TODO: It is reasonable to complete bio with error here. */
}
@@ -225,11 +233,6 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
atomic_set(&bio->__bi_cnt, count);
}
-static inline bool bio_flagged(struct bio *bio, unsigned int bit)
-{
- return (bio->bi_flags & (1U << bit)) != 0;
-}
-
static inline void bio_set_flag(struct bio *bio, unsigned int bit)
{
bio->bi_flags |= (1U << bit);
@@ -447,7 +450,7 @@ static inline void bio_wouldblock_error(struct bio *bio)
*/
static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
{
- if (iov_iter_is_bvec(iter))
+ if (iov_iter_is_bvec(iter) || iov_iter_is_dma_tag(iter))
return 0;
return iov_iter_npages(iter, max_segs);
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e10aabb36c2c..f5e0aa61bf85 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1141,6 +1141,17 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
}
void blk_dump_rq_flags(struct request *, char *);
+static inline void *blk_rq_dma_tag(struct request *rq)
+{
+ return rq->bio && bio_flagged(rq->bio, BIO_DMA_TAGGED) ?
+ rq->bio->bi_dma_tag : 0;
+}
+
+static inline size_t blk_rq_dma_offset(struct request *rq)
+{
+ return rq->bio->bi_iter.bi_bvec_done;
+}
+
#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1ef99790f6ed..ea6db439acbe 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -299,7 +299,10 @@ struct bio {
atomic_t __bi_cnt; /* pin count */
- struct bio_vec *bi_io_vec; /* the actual vec list */
+ union {
+ struct bio_vec *bi_io_vec; /* the actual vec list */
+ void *bi_dma_tag; /* driver specific tag */
+ };
struct bio_set *bi_pool;
@@ -334,6 +337,7 @@ enum {
BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_DMA_TAGGED, /* Using premapped dma buffers */
BIO_FLAG_LAST
};
--
2.30.2
* [PATCHv2 5/7] io_uring: introduce file slot release helper
2022-08-02 19:36 [PATCHv2 0/7] dma mapping optimisations Keith Busch
` (3 preceding siblings ...)
2022-08-02 19:36 ` [PATCHv2 4/7] block: add dma tag bio type Keith Busch
@ 2022-08-02 19:36 ` Keith Busch
2022-08-02 19:36 ` [PATCHv2 6/7] io_uring: add support for dma pre-mapping Keith Busch
` (2 subsequent siblings)
7 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
To: linux-nvme, linux-block, io-uring, linux-fsdevel
Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <[email protected]>
Releasing the pre-registered file follows a repeated pattern. Introduce
a helper to make it easier to add more complexity to this resource in
the future.
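For reference, each converted call site below collapses to roughly this
shape (sketch only; mirrors the three sites changed by this patch):

	if (file_slot->file_ptr) {
		ret = io_file_slot_queue_removal(ctx, file_slot);
		if (ret)
			goto err;
		/* slot pointer and bitmap are cleared inside the helper */
		needs_switch = true;
	}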
Signed-off-by: Keith Busch <[email protected]>
---
io_uring/filetable.c | 33 +++++++++++++++++++++------------
io_uring/filetable.h | 3 +++
io_uring/rsrc.c | 5 +----
3 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 7b473259f3f4..1b8db1918678 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -76,19 +76,13 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
if (file_slot->file_ptr) {
- struct file *old_file;
-
ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;
- old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
- ctx->rsrc_node, old_file);
+ ret = io_file_slot_queue_removal(ctx, file_slot);
if (ret)
goto err;
- file_slot->file_ptr = 0;
- io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
}
@@ -148,7 +142,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
{
struct io_fixed_file *file_slot;
- struct file *file;
int ret;
if (unlikely(!ctx->file_data))
@@ -164,13 +157,10 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
if (!file_slot->file_ptr)
return -EBADF;
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
+ ret = io_file_slot_queue_removal(ctx, file_slot);
if (ret)
return ret;
- file_slot->file_ptr = 0;
- io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
return 0;
}
@@ -191,3 +181,22 @@ int io_register_file_alloc_range(struct io_ring_ctx *ctx,
io_file_table_set_alloc_range(ctx, range.off, range.len);
return 0;
}
+
+int io_file_slot_queue_removal(struct io_ring_ctx *ctx,
+ struct io_fixed_file *file_slot)
+{
+ u32 slot_index = file_slot - ctx->file_table.files;
+ struct file *file;
+ int ret;
+
+ file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
+ ctx->rsrc_node, file);
+ if (ret)
+ return ret;
+
+ file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, slot_index);
+
+ return 0;
+}
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index ff3a712e11bf..e52ecf359199 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -34,6 +34,9 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
int io_register_file_alloc_range(struct io_ring_ctx *ctx,
struct io_uring_file_index_range __user *arg);
+int io_file_slot_queue_removal(struct io_ring_ctx *ctx,
+ struct io_fixed_file *file_slot);
+
unsigned int io_file_get_flags(struct file *file);
static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..1f10eecad4d7 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -469,12 +469,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
file_slot = io_fixed_file_slot(&ctx->file_table, i);
if (file_slot->file_ptr) {
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
+ err = io_file_slot_queue_removal(ctx, file_slot);
if (err)
break;
- file_slot->file_ptr = 0;
- io_file_bitmap_clear(&ctx->file_table, i);
needs_switch = true;
}
if (fd != -1) {
--
2.30.2
* [PATCHv2 6/7] io_uring: add support for dma pre-mapping
2022-08-02 19:36 [PATCHv2 0/7] dma mapping optimisations Keith Busch
` (4 preceding siblings ...)
2022-08-02 19:36 ` [PATCHv2 5/7] io_uring: introduce file slot release helper Keith Busch
@ 2022-08-02 19:36 ` Keith Busch
2022-08-02 23:25 ` Ammar Faizi
2022-08-02 19:36 ` [PATCHv2 7/7] nvme-pci: implement dma_map support Keith Busch
2022-08-03 20:52 ` [PATCHv2 0/7] dma mapping optimisations Jens Axboe
7 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
To: linux-nvme, linux-block, io-uring, linux-fsdevel
Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <[email protected]>
Provide a new register operation that requests pre-mapping of a known
bvec through the requested fixed file's implementation. If successful,
io_uring will use the returned dma tag for future fixed buffer requests
to the same file.
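From userspace, the new opcode would be driven roughly as follows (a
sketch, not part of this series: it assumes headers that contain the
uapi additions below, uses the raw syscall since liburing has no wrapper
for this, and assumes the ring plus fixed files and buffers were already
registered; CAP_SYS_ADMIN and an O_DIRECT fixed file are required per
the checks in this patch):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* map registered buffers [buf_start, buf_end) onto fixed file fd_index */
static int map_registered_bufs(int ring_fd, int fd_index,
			       int buf_start, int buf_end)
{
	struct io_uring_map_buffers map = {
		.fd		= fd_index,	/* index into registered files */
		.buf_start	= buf_start,
		.buf_end	= buf_end,
	};

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_MAP_BUFFERS, &map, 1);
}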
Signed-off-by: Keith Busch <[email protected]>
---
include/linux/io_uring_types.h | 2 +
include/uapi/linux/io_uring.h | 12 +++
io_uring/filetable.c | 7 +-
io_uring/filetable.h | 7 +-
io_uring/io_uring.c | 137 +++++++++++++++++++++++++++++++++
io_uring/net.c | 2 +-
io_uring/rsrc.c | 21 +++--
io_uring/rsrc.h | 10 ++-
io_uring/rw.c | 2 +-
9 files changed, 185 insertions(+), 15 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f7fab3758cb9..f62ea17cc480 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -23,6 +23,8 @@ struct io_wq_work {
int cancel_seq;
};
+struct io_mapped_ubuf;
+
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
IORING_REGISTER_NOTIFIERS = 26,
IORING_UNREGISTER_NOTIFIERS = 27,
+ /* dma map registered buffers */
+ IORING_REGISTER_MAP_BUFFERS = 28,
+ IORING_REGISTER_UNMAP_BUFFERS = 29,
+
/* this goes last */
IORING_REGISTER_LAST
};
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
__u32 flags;
};
+struct io_uring_map_buffers {
+ __s32 fd;
+ __s32 buf_start;
+ __s32 buf_end;
+ __u32 flags;
+ __u64 rsvd[2];
+};
+
#endif
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 1b8db1918678..5ca2f27f317f 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -189,9 +189,10 @@ int io_file_slot_queue_removal(struct io_ring_ctx *ctx,
struct file *file;
int ret;
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
- ctx->rsrc_node, file);
+ file = io_file_from_fixed(file_slot);
+ io_dma_unmap_file(ctx, file_slot);
+ ret = io_queue_rsrc_removal(ctx->file_data, slot_index, ctx->rsrc_node,
+ file);
if (ret)
return ret;
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index e52ecf359199..3b2aae5bff76 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -58,12 +58,17 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i)
return &table->files[i];
}
+static inline struct file *io_file_from_fixed(struct io_fixed_file *f)
+{
+ return (struct file *) (f->file_ptr & FFS_MASK);
+}
+
static inline struct file *io_file_from_index(struct io_file_table *table,
int index)
{
struct io_fixed_file *slot = io_fixed_file_slot(table, index);
- return (struct file *) (slot->file_ptr & FFS_MASK);
+ return io_file_from_fixed(slot);
}
static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b54218da075c..f5be488eaf21 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3681,6 +3681,131 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
return ret;
}
+static int get_map_range(struct io_ring_ctx *ctx,
+ struct io_uring_map_buffers *map, void __user *arg)
+{
+ int ret;
+
+ if (copy_from_user(map, arg, sizeof(*map)))
+ return -EFAULT;
+ if (map->flags || map->rsvd[0] || map->rsvd[1])
+ return -EINVAL;
+ if (map->fd >= ctx->nr_user_files)
+ return -EINVAL;
+ if (map->buf_start < 0)
+ return -EINVAL;
+ if (map->buf_start >= ctx->nr_user_bufs)
+ return -EINVAL;
+ if (map->buf_end > ctx->nr_user_bufs)
+ map->buf_end = ctx->nr_user_bufs;
+
+ ret = map->buf_end - map->buf_start;
+ if (ret <= 0)
+ return -EINVAL;
+
+ return ret;
+}
+
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+ struct file *file;
+
+ if (!imu->dma_tag)
+ return;
+
+ file = io_file_from_fixed(imu->dma_file);
+ file_dma_unmap(file, imu->dma_tag);
+ imu->dma_file = NULL;
+ imu->dma_tag = NULL;
+}
+
+void io_dma_unmap_file(struct io_ring_ctx *ctx, struct io_fixed_file *file_slot)
+{
+ int i;
+
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+ if (!imu->dma_tag)
+ continue;
+
+ if (imu->dma_file == file_slot)
+ io_dma_unmap(imu);
+ }
+}
+
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_map_buffers map;
+ int i, ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ ret = get_map_range(ctx, &map, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = map.buf_start; i < map.buf_end; i++) {
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+ io_dma_unmap(imu);
+ }
+
+ return 0;
+}
+
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_map_buffers map;
+ struct io_fixed_file *file_slot;
+ struct file *file;
+ int ret, i;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = get_map_range(ctx, &map, arg);
+ if (ret < 0)
+ return ret;
+
+ file_slot = io_fixed_file_slot(&ctx->file_table,
+ array_index_nospec(map.fd, ctx->nr_user_files));
+ if (!file_slot || !file_slot->file_ptr)
+ return -EBADF;
+
+ file = io_file_from_fixed(file_slot);
+ if (!(file->f_flags & O_DIRECT))
+ return -EOPNOTSUPP;
+
+ for (i = map.buf_start; i < map.buf_end; i++) {
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+ void *tag;
+
+ if (imu->dma_tag) {
+ ret = -EBUSY;
+ goto err;
+ }
+
+ tag = file_dma_map(file, imu->bvec, imu->nr_bvecs);
+ if (IS_ERR(tag)) {
+ ret = PTR_ERR(tag);
+ goto err;
+ }
+
+ imu->dma_tag = tag;
+ imu->dma_file = file_slot;
+ }
+
+ return 0;
+err:
+ while (--i >= map.buf_start) {
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+ io_dma_unmap(imu);
+ }
+ return ret;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -3847,6 +3972,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_notif_unregister(ctx);
break;
+ case IORING_REGISTER_MAP_BUFFERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_map_buffers(ctx, arg);
+ break;
+ case IORING_REGISTER_UNMAP_BUFFERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_unmap_buffers(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 32fc3da04e41..2793fd7d99d5 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
- (u64)(uintptr_t)zc->buf, zc->len);
+ (u64)(uintptr_t)zc->buf, zc->len, NULL);
if (unlikely(ret))
return ret;
} else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 1f10eecad4d7..ee5e5284203d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
+ io_dma_unmap(imu);
kvfree(imu);
}
*slot = NULL;
@@ -809,12 +810,16 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
int i;
for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file = io_file_from_index(&ctx->file_table, i);
+ struct io_fixed_file *f = io_fixed_file_slot(&ctx->file_table, i);
+ struct file *file;
- if (!file)
+ if (!f)
continue;
- if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
+ if (f->file_ptr & FFS_SCM)
continue;
+
+ io_dma_unmap_file(ctx, f);
+ file = io_file_from_fixed(f);
io_file_bitmap_clear(&ctx->file_table, i);
fput(file);
}
@@ -1282,6 +1287,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
+ imu->dma_tag = NULL;
*pimu = imu;
ret = 0;
done:
@@ -1356,9 +1362,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+ u64 buf_addr, size_t len, struct file *file)
{
u64 buf_end;
size_t offset;
@@ -1376,6 +1381,10 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
+ if (imu->dma_tag && file == io_file_from_fixed(imu->dma_file)) {
+ iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, len);
+ return 0;
+ }
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..47a2942aa537 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,8 @@ struct io_mapped_ubuf {
u64 ubuf_end;
unsigned int nr_bvecs;
unsigned long acct_pages;
+ void *dma_tag;
+ struct io_fixed_file *dma_file;
struct bio_vec bvec[];
};
@@ -64,9 +66,11 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill);
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+ u64 buf_addr, size_t len, struct file *file);
+
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+void io_dma_unmap_file(struct io_ring_ctx *ctx, struct io_fixed_file *file_slot);
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
ssize_t ret;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+ ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
if (ret)
return ERR_PTR(ret);
return NULL;
--
2.30.2
* Re: [PATCHv2 6/7] io_uring: add support for dma pre-mapping
2022-08-02 19:36 ` [PATCHv2 6/7] io_uring: add support for dma pre-mapping Keith Busch
@ 2022-08-02 23:25 ` Ammar Faizi
0 siblings, 0 replies; 12+ messages in thread
From: Ammar Faizi @ 2022-08-02 23:25 UTC (permalink / raw)
To: Keith Busch
Cc: Facebook Kernel Team, Jens Axboe, Christoph Hellwig, Al Viro,
Keith Busch, Fernanda Ma'rouf, io-uring Mailing List,
Linux Block Mailing List, Linux fsdevel Mailing List,
Linux NVME Mailing List
On 8/3/22 2:36 AM, Keith Busch wrote:
> From: Keith Busch <[email protected]>
>
> Provide a new register operation that can request to pre-map a known
> bvec to the requested fixed file's specific implementation. If
> successful, io_uring will use the returned dma tag for future fixed
> buffer requests to the same file.
>
> Signed-off-by: Keith Busch <[email protected]>
[...]
> +static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
> +{
> + struct io_uring_map_buffers map;
> + struct io_fixed_file *file_slot;
> + struct file *file;
> + int ret, i;
> +
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + ret = get_map_range(ctx, &map, arg);
> + if (ret < 0)
> + return ret;
> +
> + file_slot = io_fixed_file_slot(&ctx->file_table,
> + array_index_nospec(map.fd, ctx->nr_user_files));
> + if (!file_slot || !file_slot->file_ptr)
> + return -EBADF;
The @file_slot NULL-check doesn't make sense. The definition of
io_fixed_file_slot() is:
static inline struct io_fixed_file *
io_fixed_file_slot(struct io_file_table *table, unsigned i)
{
return &table->files[i];
}
which takes the address of an element in the array. So @file_slot
should never be NULL; if it ever is, something has gone wrong.
If you ever had @ctx->file_table.files being NULL in this path, you
should NULL-check the @->files itself, *not* the return value of
io_fixed_file_slot().
IOW:
...
// NULL check here.
if (!ctx->file_table.files)
return -EBADF;
file_slot = io_fixed_file_slot(&ctx->file_table,
array_index_nospec(map.fd, ctx->nr_user_files));
if (!file_slot->file_ptr)
return -EBADF;
...
> for (i = 0; i < ctx->nr_user_files; i++) {
> - struct file *file = io_file_from_index(&ctx->file_table, i);
> + struct io_fixed_file *f = io_fixed_file_slot(&ctx->file_table, i);
> + struct file *file;
>
> - if (!file)
> + if (!f)
> continue;
The same thing, this @f NULL-check is not needed.
> - if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
> + if (f->file_ptr & FFS_SCM)
> continue;
> +
> + io_dma_unmap_file(ctx, f);
> + file = io_file_from_fixed(f);
> io_file_bitmap_clear(&ctx->file_table, i);
> fput(file);
> }
--
Ammar Faizi
* [PATCHv2 7/7] nvme-pci: implement dma_map support
2022-08-02 19:36 [PATCHv2 0/7] dma mapping optimisations Keith Busch
` (5 preceding siblings ...)
2022-08-02 19:36 ` [PATCHv2 6/7] io_uring: add support for dma pre-mapping Keith Busch
@ 2022-08-02 19:36 ` Keith Busch
2022-08-03 20:52 ` [PATCHv2 0/7] dma mapping optimisations Jens Axboe
7 siblings, 0 replies; 12+ messages in thread
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
To: linux-nvme, linux-block, io-uring, linux-fsdevel
Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <[email protected]>
Implement callbacks to convert a registered bio_vec to a PRP list, and
use this for each IO that uses the returned tag. This saves repeated IO
conversions and DMA mapping/unmapping. In many cases, the driver can
skip per-IO pool allocations entirely, saving potentially significant
CPU cycles.
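The per-IO fast path this buys is essentially index arithmetic into the
premapped PRP array; a simplified sketch of that math (mirroring
nvme_premapped() below, with syncing, pool fallbacks and PRP-list
chaining omitted):

	/* locate the PRPs covering this request inside mapping->prps[] */
	int off = blk_rq_dma_offset(req) + mapping->offset;
	int i = off >> NVME_CTRL_PAGE_SHIFT;	/* first premapped PRP entry */

	off &= NVME_CTRL_PAGE_SIZE - 1;
	cmnd->dptr.prp1 = cpu_to_le64(le64_to_cpu(mapping->prps[i]) + off);

	/* when the remaining PRP entries fit in one NVMe page, prp2 can point
	 * straight into the coherent prps[] array, skipping the per-IO
	 * dma_pool allocation entirely */
	cmnd->dptr.prp2 = cpu_to_le64(mapping->prp_dma_addr + 8 * (i + 1));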
Signed-off-by: Keith Busch <[email protected]>
---
drivers/nvme/host/pci.c | 302 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 292 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 644664098ae7..2df2b9bde7d7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -110,6 +110,14 @@ struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
+struct nvme_dma_mapping {
+ int nr_pages;
+ u16 offset;
+ u8 rsvd[2];
+ dma_addr_t prp_dma_addr;
+ __le64 *prps;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
@@ -544,6 +552,34 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
return true;
}
+static void nvme_sync_dma(struct nvme_dev *dev, struct request *req,
+ struct nvme_dma_mapping *mapping)
+{
+ int offset, i, j, length, nprps;
+ bool needs_sync;
+
+ offset = blk_rq_dma_offset(req) + mapping->offset;
+ i = offset >> NVME_CTRL_PAGE_SHIFT;
+ needs_sync = rq_data_dir(req) == READ &&
+ dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
+
+ if (!needs_sync)
+ return;
+
+ offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+ length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+ nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+
+ dma_sync_single_for_cpu(dev->dev,
+ le64_to_cpu(mapping->prps[i++]),
+ NVME_CTRL_PAGE_SIZE - offset, DMA_FROM_DEVICE);
+ for (j = 1; j < nprps; j++) {
+ dma_sync_single_for_cpu(dev->dev,
+ le64_to_cpu(mapping->prps[i++]),
+ NVME_CTRL_PAGE_SIZE, DMA_FROM_DEVICE);
+ }
+}
+
static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
@@ -576,10 +612,24 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
}
}
+static void nvme_free_prp_chain(struct nvme_dev *dev, struct request *req,
+ struct nvme_iod *iod)
+{
+ if (iod->npages == 0)
+ dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+ iod->first_dma);
+ else if (iod->use_sgl)
+ nvme_free_sgls(dev, req);
+ else
+ nvme_free_prps(dev, req);
+ mempool_free(iod->sg, dev->iod_mempool);
+}
+
static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ WARN_ON_ONCE(!iod->nents);
if (is_pci_p2pdma_page(sg_page(iod->sg)))
pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
rq_dma_dir(req));
@@ -589,25 +639,24 @@ static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
+ struct nvme_dma_mapping *mapping = blk_rq_dma_tag(req);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ if (mapping) {
+ nvme_sync_dma(dev, req, mapping);
+ if (iod->npages >= 0)
+ nvme_free_prp_chain(dev, req, iod);
+ return;
+ }
+
if (iod->dma_len) {
dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
rq_dma_dir(req));
return;
}
- WARN_ON_ONCE(!iod->nents);
-
nvme_unmap_sg(dev, req);
- if (iod->npages == 0)
- dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
- iod->first_dma);
- else if (iod->use_sgl)
- nvme_free_sgls(dev, req);
- else
- nvme_free_prps(dev, req);
- mempool_free(iod->sg, dev->iod_mempool);
+ nvme_free_prp_chain(dev, req, iod);
}
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
@@ -835,13 +884,136 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
return BLK_STS_OK;
}
+static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
+ struct nvme_rw_command *cmnd,
+ struct nvme_iod *iod,
+ struct nvme_dma_mapping *mapping)
+{
+ static const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
+ dma_addr_t prp_list_start, prp_list_end, prp_dma;
+ int i, offset, j, length, nprps, nprps_left;
+ struct dma_pool *pool;
+ __le64 *prp_list;
+ bool needs_sync;
+ void **list;
+
+ offset = blk_rq_dma_offset(req) + mapping->offset;
+ i = offset >> NVME_CTRL_PAGE_SHIFT;
+ offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+ needs_sync = rq_data_dir(req) == WRITE &&
+ dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
+
+ if (needs_sync) {
+ dma_sync_single_for_device(dev->dev,
+ le64_to_cpu(mapping->prps[i]),
+ NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
+ }
+
+ length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+ cmnd->dptr.prp1 = cpu_to_le64(le64_to_cpu(mapping->prps[i++]) + offset);
+
+ if (length <= 0)
+ return BLK_STS_OK;
+
+ if (length <= NVME_CTRL_PAGE_SIZE) {
+ if (needs_sync)
+ dma_sync_single_for_device(dev->dev,
+ le64_to_cpu(mapping->prps[i]),
+ NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+ cmnd->dptr.prp2 = mapping->prps[i];
+ return BLK_STS_OK;
+ }
+
+ nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+ prp_list_start = mapping->prp_dma_addr + 8 * i;
+ prp_list_end = prp_list_start + 8 * nprps;
+
+ /* Optimization when remaining list fits in one nvme page */
+ if ((prp_list_start >> NVME_CTRL_PAGE_SHIFT) ==
+ (prp_list_end >> NVME_CTRL_PAGE_SHIFT)) {
+ cmnd->dptr.prp2 = cpu_to_le64(prp_list_start);
+ goto sync;
+ }
+
+ iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+ if (!iod->sg)
+ return BLK_STS_RESOURCE;
+
+ if (nprps <= (256 / 8)) {
+ pool = dev->prp_small_pool;
+ iod->npages = 0;
+ } else {
+ pool = dev->prp_page_pool;
+ iod->npages = 1;
+ }
+
+ prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+ if (!prp_list) {
+ iod->npages = -1;
+ goto out_free_sg;
+ }
+
+ list = nvme_pci_iod_list(req);
+ list[0] = prp_list;
+ iod->first_dma = prp_dma;
+
+ for (;;) {
+ dma_addr_t next_prp_dma;
+ __le64 *next_prp_list;
+
+ if (nprps_left <= last_prp + 1) {
+ memcpy(prp_list, &mapping->prps[i], nprps_left * 8);
+ break;
+ }
+
+ memcpy(prp_list, &mapping->prps[i],
+ NVME_CTRL_PAGE_SIZE - 8);
+ nprps_left -= last_prp;
+ i += last_prp;
+
+ next_prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &next_prp_dma);
+ if (!next_prp_list)
+ goto free_prps;
+
+ prp_list[last_prp] = cpu_to_le64(next_prp_dma);
+ prp_list = next_prp_list;
+ prp_dma = next_prp_dma;
+ list[iod->npages++] = prp_list;
+ }
+ cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+
+sync:
+ if (!needs_sync)
+ return BLK_STS_OK;
+
+ i = offset >> NVME_CTRL_PAGE_SHIFT;
+ for (j = 0; j < nprps; j++)
+ dma_sync_single_for_device(dev->dev,
+ le64_to_cpu(mapping->prps[i++]),
+ NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+ return BLK_STS_OK;
+
+free_prps:
+ nvme_free_prps(dev, req);
+out_free_sg:
+ mempool_free(iod->sg, dev->iod_mempool);
+ return BLK_STS_RESOURCE;
+}
+
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
+ struct nvme_dma_mapping *mapping = blk_rq_dma_tag(req);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret = BLK_STS_RESOURCE;
int nr_mapped;
+ if (mapping) {
+ iod->dma_len = 0;
+ iod->use_sgl = false;
+ return nvme_premapped(dev, req, &cmnd->rw, iod, mapping);
+ }
+
if (blk_rq_nr_phys_segments(req) == 1) {
struct bio_vec bv = req_bvec(req);
@@ -1732,6 +1904,112 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
return result;
}
+#ifdef CONFIG_HAS_DMA
+/*
+ * Important: bvec must be describing a virtually contiguous buffer.
+ */
+static void *nvme_pci_dma_map(struct request_queue *q,
+ struct bio_vec *bvec, int nr_vecs)
+{
+ const int nvme_pages = 1 << (PAGE_SIZE - NVME_CTRL_PAGE_SIZE);
+ struct nvme_ns *ns = q->queuedata;
+ struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+ struct nvme_dma_mapping *mapping;
+ int i, j, k, size, ppv, ret = -ENOMEM;
+
+ if (!nr_vecs)
+ return ERR_PTR(-EINVAL);
+
+ mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+ if (!mapping)
+ return ERR_PTR(-ENOMEM);
+
+ mapping->nr_pages = nr_vecs * nvme_pages;
+ size = sizeof(*mapping->prps) * mapping->nr_pages;
+ mapping->prps = dma_alloc_coherent(dev->dev, size,
+ &mapping->prp_dma_addr, GFP_KERNEL);
+ if (!mapping->prps)
+ goto free_mapping;
+
+ for (i = 0, k = 0; i < nr_vecs; i++) {
+ struct bio_vec *bv = bvec + i;
+ dma_addr_t dma_addr;
+
+ ppv = nvme_pages;
+ if (i == 0) {
+ mapping->offset = bv->bv_offset;
+ ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+ } else if (bv->bv_offset) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (bv->bv_offset + bv->bv_len != PAGE_SIZE &&
+ i < nr_vecs - 1) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ dma_addr = dma_map_bvec(dev->dev, bv, 0, 0);
+ if (dma_mapping_error(dev->dev, dma_addr)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (i == 0)
+ dma_addr -= mapping->offset;
+
+ for (j = 0; j < ppv; j++)
+ mapping->prps[k++] = cpu_to_le64(dma_addr +
+ j * NVME_CTRL_PAGE_SIZE);
+ }
+
+ get_device(dev->dev);
+ return mapping;
+
+err:
+ for (i = 0; i < k; i += ppv) {
+ __u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+ ppv = nvme_pages;
+
+ if (i == 0)
+ ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+ dma_unmap_page(dev->dev, dma_addr,
+ PAGE_SIZE - offset_in_page(dma_addr), 0);
+ }
+
+ dma_free_coherent(dev->dev, size, (void *)mapping->prps,
+ mapping->prp_dma_addr);
+free_mapping:
+ kfree(mapping);
+ return ERR_PTR(ret);
+}
+
+static void nvme_pci_dma_unmap(struct request_queue *q, void *dma_tag)
+{
+ const int nvme_pages = 1 << (PAGE_SIZE - NVME_CTRL_PAGE_SIZE);
+ struct nvme_ns *ns = q->queuedata;
+ struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+ struct nvme_dma_mapping *mapping = dma_tag;
+ int i, ppv;
+
+ for (i = 0; i < mapping->nr_pages; i += nvme_pages) {
+ __u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+ ppv = nvme_pages;
+
+ if (i == 0)
+ ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+ dma_unmap_page(dev->dev, dma_addr,
+ PAGE_SIZE - offset_in_page(dma_addr), 0);
+ }
+
+ dma_free_coherent(dev->dev, mapping->nr_pages * sizeof(*mapping->prps),
+ (void *)mapping->prps, mapping->prp_dma_addr);
+ kfree(mapping);
+ put_device(dev->dev);
+}
+#endif
+
static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
@@ -1750,6 +2028,10 @@ static const struct blk_mq_ops nvme_mq_ops = {
.map_queues = nvme_pci_map_queues,
.timeout = nvme_timeout,
.poll = nvme_poll,
+#ifdef CONFIG_HAS_DMA
+ .dma_map = nvme_pci_dma_map,
+ .dma_unmap = nvme_pci_dma_unmap,
+#endif
};
static void nvme_dev_remove_admin(struct nvme_dev *dev)
--
2.30.2
* Re: [PATCHv2 0/7] dma mapping optimisations
2022-08-02 19:36 [PATCHv2 0/7] dma mapping optimisations Keith Busch
` (6 preceding siblings ...)
2022-08-02 19:36 ` [PATCHv2 7/7] nvme-pci: implement dma_map support Keith Busch
@ 2022-08-03 20:52 ` Jens Axboe
2022-08-04 16:28 ` Keith Busch
7 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2022-08-03 20:52 UTC (permalink / raw)
To: Keith Busch, linux-nvme, linux-block, io-uring, linux-fsdevel
Cc: hch, Alexander Viro, Kernel Team, Keith Busch
On 8/2/22 1:36 PM, Keith Busch wrote:
> device undergoes various representations for every IO. Each consumes
> memory and CPU cycles. When the backing storage is NVMe, the sequence
> looks something like the following:
>
> __user void *
> struct iov_iter
> struct pages[]
> struct bio_vec[]
> struct scatterlist[]
> __le64[]
>
> Applications will often use the same buffer for many IO, though, so
> these potentially costly per-IO transformations to reach the exact same
> hardware descriptor can be skipped.
>
> The io_uring interface already provides a way for users to register
> buffers to get to the 'struct bio_vec[]'. That still leaves the
> scatterlist needed for the repeated dma_map_sg(), then transform to
> nvme's PRP list format.
>
> This series takes the registered buffers a step further. A block driver
> can implement a new .dma_map() callback to complete the representation
> to the hardware's DMA mapped address, and return a cookie so a user can
> reference it later for any given IO. When used, the block stack can skip
> significant amounts of code, improving CPU utilization, and, if not
> bandwidth limited, IOPs.
>
> The implementation is currently limited to mapping a registered buffer
> to a single file.
I ran this on my test box to see how we'd do. First the bad news:
smaller block size IO seems slower. I ran with QD=8 and used 24 drives,
and using t/io_uring (with registered buffers, polled, etc) and a 512b
block size I get:
IOPS=44.36M, BW=21.66GiB/s, IOS/call=1/1
IOPS=44.64M, BW=21.80GiB/s, IOS/call=2/2
IOPS=44.69M, BW=21.82GiB/s, IOS/call=1/1
IOPS=44.55M, BW=21.75GiB/s, IOS/call=2/2
IOPS=44.93M, BW=21.94GiB/s, IOS/call=1/1
IOPS=44.79M, BW=21.87GiB/s, IOS/call=1/2
and adding -D1 I get:
IOPS=43.74M, BW=21.36GiB/s, IOS/call=1/1
IOPS=44.04M, BW=21.50GiB/s, IOS/call=1/1
IOPS=43.63M, BW=21.30GiB/s, IOS/call=2/2
IOPS=43.67M, BW=21.32GiB/s, IOS/call=1/1
IOPS=43.57M, BW=21.28GiB/s, IOS/call=1/2
IOPS=43.53M, BW=21.25GiB/s, IOS/call=2/1
which does regress that workload. Since we avoid more expensive setup at
higher block sizes, I tested that too. Here's using 128k IOs with -D0:
IOPS=972.18K, BW=121.52GiB/s, IOS/call=31/31
IOPS=988.79K, BW=123.60GiB/s, IOS/call=31/31
IOPS=990.40K, BW=123.80GiB/s, IOS/call=31/31
IOPS=987.80K, BW=123.48GiB/s, IOS/call=31/31
IOPS=988.12K, BW=123.52GiB/s, IOS/call=31/31
and here with -D1:
IOPS=978.36K, BW=122.30GiB/s, IOS/call=31/31
IOPS=996.75K, BW=124.59GiB/s, IOS/call=31/31
IOPS=996.55K, BW=124.57GiB/s, IOS/call=31/31
IOPS=996.52K, BW=124.56GiB/s, IOS/call=31/31
IOPS=996.54K, BW=124.57GiB/s, IOS/call=31/31
IOPS=996.51K, BW=124.56GiB/s, IOS/call=31/31
which is a notable improvement. Then I checked CPU utilization,
switching to IRQ driven IO instead. And the good news there is that for bs=128K
we end up using half the CPU to achieve better performance. So definite
win there!
Just a quick dump of some quick results; I didn't look further into this
just yet.
--
Jens Axboe
* Re: [PATCHv2 0/7] dma mapping optimisations
2022-08-03 20:52 ` [PATCHv2 0/7] dma mapping optimisations Jens Axboe
@ 2022-08-04 16:28 ` Keith Busch
2022-08-04 16:42 ` Jens Axboe
0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2022-08-04 16:28 UTC (permalink / raw)
To: Jens Axboe
Cc: Keith Busch, linux-nvme, linux-block, io-uring, linux-fsdevel,
hch, Alexander Viro, Kernel Team
On Wed, Aug 03, 2022 at 02:52:11PM -0600, Jens Axboe wrote:
> I ran this on my test box to see how we'd do. First the bad news:
> smaller block size IO seems slower. I ran with QD=8 and used 24 drives,
> and using t/io_uring (with registered buffers, polled, etc) and a 512b
> block size I get:
>
> IOPS=44.36M, BW=21.66GiB/s, IOS/call=1/1
> IOPS=44.64M, BW=21.80GiB/s, IOS/call=2/2
> IOPS=44.69M, BW=21.82GiB/s, IOS/call=1/1
> IOPS=44.55M, BW=21.75GiB/s, IOS/call=2/2
> IOPS=44.93M, BW=21.94GiB/s, IOS/call=1/1
> IOPS=44.79M, BW=21.87GiB/s, IOS/call=1/2
>
> and adding -D1 I get:
>
> IOPS=43.74M, BW=21.36GiB/s, IOS/call=1/1
> IOPS=44.04M, BW=21.50GiB/s, IOS/call=1/1
> IOPS=43.63M, BW=21.30GiB/s, IOS/call=2/2
> IOPS=43.67M, BW=21.32GiB/s, IOS/call=1/1
> IOPS=43.57M, BW=21.28GiB/s, IOS/call=1/2
> IOPS=43.53M, BW=21.25GiB/s, IOS/call=2/1
>
> which does regress that workload.
Bummer, I would expect -D1 to be no worse. My test isn't nearly as consistent
as yours, so I'm having some trouble measuring. I'm only coming up with a few
micro-optimizations that might help. A diff is below on top of this series. I
also created a branch with everything folded in here:
git://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git io_uring/dma-register
https://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git/log/?h=io_uring/dma-register
-- >8 --
diff --git a/block/bio.c b/block/bio.c
index 3b7accae8996..c1e97dff5e40 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1154,7 +1154,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
-static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
{
size_t size = iov_iter_count(iter);
@@ -1165,8 +1165,6 @@ static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
bio->bi_opf |= REQ_NOMERGE;
bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_DMA_TAGGED);
-
- iov_iter_advance(iter, bio->bi_iter.bi_size);
}
static int bio_iov_add_page(struct bio *bio, struct page *page,
@@ -1307,6 +1305,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
if (iov_iter_is_dma_tag(iter)) {
bio_iov_dma_tag_set(bio, iter);
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
return 0;
}
diff --git a/block/fops.c b/block/fops.c
index db2d1e848f4b..1b3649c7eb17 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -325,7 +325,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
* bio_iov_iter_get_pages() and set the bvec directly.
*/
bio_iov_bvec_set(bio, iter);
- } else {
+ } else if (iov_iter_is_dma_tag(iter)) {
+ bio_iov_dma_tag_set(bio, iter);
+ } else {
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
bio_put(bio);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index dbf73ab0877e..511cae2b7ce9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -113,7 +113,8 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
struct nvme_dma_mapping {
int nr_pages;
u16 offset;
- u8 rsvd[2];
+ bool needs_sync;
+ u8 rsvd;
dma_addr_t prp_dma_addr;
__le64 *prps;
};
@@ -556,16 +557,9 @@ static void nvme_sync_dma(struct nvme_dev *dev, struct request *req,
struct nvme_dma_mapping *mapping)
{
int offset, i, j, length, nprps;
- bool needs_sync;
offset = blk_rq_dma_offset(req) + mapping->offset;
i = offset >> NVME_CTRL_PAGE_SHIFT;
- needs_sync = rq_data_dir(req) == READ &&
- dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
-
- if (!needs_sync)
- return;
-
offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
@@ -643,7 +637,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
if (mapping) {
- nvme_sync_dma(dev, req, mapping);
+ if (mapping->needs_sync)
+ nvme_sync_dma(dev, req, mapping);
if (iod->npages >= 0)
nvme_free_prp_chain(dev, req, iod);
return;
@@ -894,16 +889,13 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
int i, offset, j, length, nprps, nprps_left;
struct dma_pool *pool;
__le64 *prp_list;
- bool needs_sync;
void **list;
offset = blk_rq_dma_offset(req) + mapping->offset;
i = offset >> NVME_CTRL_PAGE_SHIFT;
offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
- needs_sync = rq_data_dir(req) == WRITE &&
- dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
- if (needs_sync) {
+ if (mapping->needs_sync) {
dma_sync_single_for_device(dev->dev,
le64_to_cpu(mapping->prps[i]),
NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
@@ -916,7 +908,7 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
return BLK_STS_OK;
if (length <= NVME_CTRL_PAGE_SIZE) {
- if (needs_sync)
+ if (mapping->needs_sync)
dma_sync_single_for_device(dev->dev,
le64_to_cpu(mapping->prps[i]),
NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
@@ -983,7 +975,7 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
sync:
- if (!needs_sync)
+ if (!mapping->needs_sync)
return BLK_STS_OK;
i = offset >> NVME_CTRL_PAGE_SHIFT;
@@ -1931,6 +1923,7 @@ static void *nvme_pci_dma_map(struct request_queue *q,
if (!mapping->prps)
goto free_mapping;
+ mapping->needs_sync = false;
for (i = 0, k = 0; i < nr_vecs; i++) {
struct bio_vec *bv = bvec + i;
dma_addr_t dma_addr;
@@ -1959,6 +1952,9 @@ static void *nvme_pci_dma_map(struct request_queue *q,
if (i == 0)
dma_addr -= mapping->offset;
+ if (dma_need_sync(dev->dev, dma_addr))
+ mapping->needs_sync = true;
+
for (j = 0; j < ppv; j++)
mapping->prps[k++] = cpu_to_le64(dma_addr +
j * NVME_CTRL_PAGE_SIZE);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 649348bc03c2..b5277ec189e0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -474,6 +474,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
+void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter);
void __bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index d370b45d7f1b..ebdf81473526 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1070,6 +1070,9 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
iov_iter_iovec_advance(i, size);
} else if (iov_iter_is_bvec(i)) {
iov_iter_bvec_advance(i, size);
+ } else if (iov_iter_is_dma_tag(i)) {
+ i->iov_offset += size;
+ i->count -= size;
} else if (iov_iter_is_pipe(i)) {
pipe_advance(i, size);
} else if (unlikely(iov_iter_is_xarray(i))) {
@@ -1077,9 +1080,6 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
i->count -= size;
} else if (iov_iter_is_discard(i)) {
i->count -= size;
- } else if (iov_iter_is_dma_tag(i)) {
- i->iov_offset += size;
- i->count -= size;
}
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -1353,6 +1353,9 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
if (iov_iter_is_bvec(i))
return iov_iter_aligned_bvec(i, addr_mask, len_mask);
+ if (iov_iter_is_dma_tag(i))
+ return !(i->iov_offset & addr_mask);
+
if (iov_iter_is_pipe(i)) {
unsigned int p_mask = i->pipe->ring_size - 1;
size_t size = i->count;
-- 8< --
* Re: [PATCHv2 0/7] dma mapping optimisations
2022-08-04 16:28 ` Keith Busch
@ 2022-08-04 16:42 ` Jens Axboe
0 siblings, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2022-08-04 16:42 UTC (permalink / raw)
To: Keith Busch
Cc: Keith Busch, linux-nvme, linux-block, io-uring, linux-fsdevel,
hch, Alexander Viro, Kernel Team
On 8/4/22 10:28 AM, Keith Busch wrote:
> On Wed, Aug 03, 2022 at 02:52:11PM -0600, Jens Axboe wrote:
>> I ran this on my test box to see how we'd do. First the bad news:
>> smaller block size IO seems slower. I ran with QD=8 and used 24 drives,
>> and using t/io_uring (with registered buffers, polled, etc) and a 512b
>> block size I get:
>>
>> IOPS=44.36M, BW=21.66GiB/s, IOS/call=1/1
>> IOPS=44.64M, BW=21.80GiB/s, IOS/call=2/2
>> IOPS=44.69M, BW=21.82GiB/s, IOS/call=1/1
>> IOPS=44.55M, BW=21.75GiB/s, IOS/call=2/2
>> IOPS=44.93M, BW=21.94GiB/s, IOS/call=1/1
>> IOPS=44.79M, BW=21.87GiB/s, IOS/call=1/2
>>
>> and adding -D1 I get:
>>
>> IOPS=43.74M, BW=21.36GiB/s, IOS/call=1/1
>> IOPS=44.04M, BW=21.50GiB/s, IOS/call=1/1
>> IOPS=43.63M, BW=21.30GiB/s, IOS/call=2/2
>> IOPS=43.67M, BW=21.32GiB/s, IOS/call=1/1
>> IOPS=43.57M, BW=21.28GiB/s, IOS/call=1/2
>> IOPS=43.53M, BW=21.25GiB/s, IOS/call=2/1
>>
>> which does regress that workload.
>
> Bummer, I would expect -D1 to be no worse. My test isn't nearly as consistent
> as yours, so I'm having some trouble measuring. I'm only coming with a few
> micro-optimizations that might help. A diff is below on top of this series. I
> also created a branch with everything folded in here:
That seemed to do the trick! Don't pay any attention to the numbers
being slightly different from before for -D0; it's a slightly different
kernel. But same test, -d8 -s2 -c2, polled:
-D0 -B1
IOPS=45.39M, BW=22.16GiB/s, IOS/call=1/1
IOPS=46.06M, BW=22.49GiB/s, IOS/call=2/1
IOPS=45.70M, BW=22.31GiB/s, IOS/call=1/1
IOPS=45.71M, BW=22.32GiB/s, IOS/call=2/2
IOPS=45.83M, BW=22.38GiB/s, IOS/call=1/1
IOPS=45.64M, BW=22.29GiB/s, IOS/call=2/2
-D1 -B1
IOPS=45.94M, BW=22.43GiB/s, IOS/call=1/1
IOPS=46.08M, BW=22.50GiB/s, IOS/call=1/1
IOPS=46.27M, BW=22.59GiB/s, IOS/call=2/1
IOPS=45.88M, BW=22.40GiB/s, IOS/call=1/1
IOPS=46.18M, BW=22.55GiB/s, IOS/call=2/1
IOPS=46.13M, BW=22.52GiB/s, IOS/call=2/2
IOPS=46.40M, BW=22.66GiB/s, IOS/call=1/1
which is a smidge higher, and definitely not regressing now.
--
Jens Axboe