* [PATCH 1/5] bio: add allocation cache abstraction
From: Jens Axboe @ 2021-08-10 16:37 UTC
To: io-uring; +Cc: linux-block, Jens Axboe
Add a set of helpers that can encapsulate bio allocations, reusing them
as needed. The caller must provide any locking that is needed.
The primary intended use case is polled IO from io_uring, which will not
need any external locking.
Very simple - it keeps a count of bios in the cache, and maintains a max
of 512 with a slack of 64. If we get above max + slack, we drop a slack's
worth of bios.
The cache is intended to be per-task, and the user will need to supply
the storage for it. As io_uring will be the only user right now, provide
a hook that returns the cache there. Stub it out as NULL initially.
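For reference, a minimal (hypothetical) single-threaded user of the new
helpers would look roughly like the sketch below. Only the
bio_alloc_cache_*()/bio_cache_*() calls are from this patch; the
example_*() wrappers are made up for illustration:

	/*
	 * The caller owns the storage and must guarantee that get/put
	 * never race, e.g. by only using the cache for polled IO.
	 */
	static struct bio_alloc_cache cache;

	static void example_init(void)
	{
		bio_alloc_cache_init(&cache);
	}

	static struct bio *example_alloc(unsigned short nr_vecs, bool *cached)
	{
		struct bio *bio;

		/* Only bios that fit in the inline bvecs are cacheable */
		bio = bio_cache_get(&cache, GFP_KERNEL, nr_vecs, &fs_bio_set);
		*cached = (bio != NULL);
		if (!bio)
			bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, &fs_bio_set);
		return bio;
	}

	static void example_complete(struct bio *bio, bool cached)
	{
		if (cached)
			bio_cache_put(&cache, bio);	/* recycle */
		else
			bio_put(bio);
	}

	static void example_exit(void)
	{
		/* Frees whatever is still sitting on the free list */
		bio_alloc_cache_destroy(&cache);
	}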
Signed-off-by: Jens Axboe <[email protected]>
---
block/bio.c | 123 ++++++++++++++++++++++++++++++++++++++++----
include/linux/bio.h | 24 +++++++--
2 files changed, 131 insertions(+), 16 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 1fab762e079b..e3680702aeae 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -238,6 +238,35 @@ static void bio_free(struct bio *bio)
}
}
+static inline void __bio_init(struct bio *bio)
+{
+ bio->bi_next = NULL;
+ bio->bi_bdev = NULL;
+ bio->bi_opf = 0;
+ bio->bi_flags = bio->bi_ioprio = bio->bi_write_hint = 0;
+ bio->bi_status = 0;
+ bio->bi_iter.bi_sector = 0;
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_idx = 0;
+ bio->bi_iter.bi_bvec_done = 0;
+ bio->bi_end_io = NULL;
+ bio->bi_private = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ bio->bi_blkg = NULL;
+ bio->bi_issue.value = 0;
+#ifdef CONFIG_BLK_CGROUP_IOCOST
+ bio->bi_iocost_cost = 0;
+#endif
+#endif
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ bio->bi_crypt_context = NULL;
+#endif
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ bio->bi_integrity = NULL;
+#endif
+ bio->bi_vcnt = 0;
+}
+
/*
* Users of this function have their own bio allocation. Subsequently,
* they must remember to pair any call to bio_init() with bio_uninit()
@@ -246,7 +275,7 @@ static void bio_free(struct bio *bio)
void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs)
{
- memset(bio, 0, sizeof(*bio));
+ __bio_init(bio);
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
@@ -591,6 +620,19 @@ void guard_bio_eod(struct bio *bio)
bio_truncate(bio, maxsector << 9);
}
+static bool __bio_put(struct bio *bio)
+{
+ if (!bio_flagged(bio, BIO_REFFED))
+ return true;
+
+ BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+
+ /*
+ * last put frees it
+ */
+ return atomic_dec_and_test(&bio->__bi_cnt);
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@@ -601,17 +643,8 @@ void guard_bio_eod(struct bio *bio)
**/
void bio_put(struct bio *bio)
{
- if (!bio_flagged(bio, BIO_REFFED))
+ if (__bio_put(bio))
bio_free(bio);
- else {
- BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
-
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->__bi_cnt))
- bio_free(bio);
- }
}
EXPORT_SYMBOL(bio_put);
@@ -1595,6 +1628,74 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
}
EXPORT_SYMBOL(bioset_init_from_src);
+void bio_alloc_cache_init(struct bio_alloc_cache *cache)
+{
+ bio_list_init(&cache->free_list);
+ cache->nr = 0;
+}
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ struct bio *bio;
+ unsigned int i;
+
+ i = 0;
+ while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
+ cache->nr--;
+ bio_free(bio);
+ if (++i == nr)
+ break;
+ }
+}
+
+void bio_alloc_cache_destroy(struct bio_alloc_cache *cache)
+{
+ bio_alloc_cache_prune(cache, -1U);
+}
+
+struct bio *bio_cache_get(struct bio_alloc_cache *cache, gfp_t gfp,
+ unsigned short nr_vecs, struct bio_set *bs)
+{
+ struct bio *bio;
+
+ if (nr_vecs > BIO_INLINE_VECS)
+ return NULL;
+ if (bio_list_empty(&cache->free_list)) {
+alloc:
+ if (bs)
+ return bio_alloc_bioset(gfp, nr_vecs, bs);
+ else
+ return bio_alloc(gfp, nr_vecs);
+ }
+
+ bio = bio_list_peek(&cache->free_list);
+ if (bs && bio->bi_pool != bs)
+ goto alloc;
+ bio_list_del_head(&cache->free_list, bio);
+ cache->nr--;
+ bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+ return bio;
+}
+
+#define ALLOC_CACHE_MAX 512
+#define ALLOC_CACHE_SLACK 64
+
+void bio_cache_put(struct bio_alloc_cache *cache, struct bio *bio)
+{
+ if (unlikely(!__bio_put(bio)))
+ return;
+ if (cache) {
+ bio_uninit(bio);
+ bio_list_add_head(&cache->free_list, bio);
+ cache->nr++;
+ if (cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
+ bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
+ } else {
+ bio_free(bio);
+ }
+}
+
static int __init init_bio(void)
{
int i;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2203b686e1f0..c351aa88d137 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -652,18 +652,22 @@ static inline struct bio *bio_list_peek(struct bio_list *bl)
return bl->head;
}
-static inline struct bio *bio_list_pop(struct bio_list *bl)
+static inline void bio_list_del_head(struct bio_list *bl, struct bio *head)
{
- struct bio *bio = bl->head;
-
- if (bio) {
+ if (head) {
bl->head = bl->head->bi_next;
if (!bl->head)
bl->tail = NULL;
- bio->bi_next = NULL;
+ head->bi_next = NULL;
}
+}
+static inline struct bio *bio_list_pop(struct bio_list *bl)
+{
+ struct bio *bio = bl->head;
+
+ bio_list_del_head(bl, bio);
return bio;
}
@@ -676,6 +680,16 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
return bio;
}
+struct bio_alloc_cache {
+ struct bio_list free_list;
+ unsigned int nr;
+};
+
+void bio_alloc_cache_init(struct bio_alloc_cache *);
+void bio_alloc_cache_destroy(struct bio_alloc_cache *);
+struct bio *bio_cache_get(struct bio_alloc_cache *, gfp_t, unsigned short, struct bio_set *bs);
+void bio_cache_put(struct bio_alloc_cache *, struct bio *);
+
/*
* Increment chain count for the bio. Make sure the CHAIN flag update
* is visible before the raised count.
--
2.32.0
* Re: [PATCH 1/5] bio: add allocation cache abstraction
From: Christoph Hellwig @ 2021-08-11 8:34 UTC
To: Jens Axboe; +Cc: io-uring, linux-block
> +static inline void __bio_init(struct bio *bio)
> +{
> +}
> +
> /*
> * Users of this function have their own bio allocation. Subsequently,
> * they must remember to pair any call to bio_init() with bio_uninit()
> @@ -246,7 +275,7 @@ static void bio_free(struct bio *bio)
> void bio_init(struct bio *bio, struct bio_vec *table,
> unsigned short max_vecs)
> {
> - memset(bio, 0, sizeof(*bio));
> + __bio_init(bio);
Please split this into a separate, well-documented prep patch.
* [PATCH 2/5] io_uring: use kiocb->private to hold rw_len
From: Jens Axboe @ 2021-08-10 16:37 UTC
To: io-uring; +Cc: linux-block, Jens Axboe
We don't need a separate member in io_rw for this; we can just use the
kiocb->private field, as we're not using it for anything else anyway.
This saves 8 bytes in io_rw, which we'll need once kiocb grows a new
member.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 27 +++++++++++++++++++--------
1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91a301bb1644..f35b54f016f3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -557,7 +557,6 @@ struct io_rw {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct kiocb kiocb;
u64 addr;
- u64 len;
};
struct io_connect {
@@ -2675,6 +2674,16 @@ static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
return __io_file_supports_nowait(req->file, rw);
}
+static inline void *u64_to_ptr(__u64 ptr)
+{
+ return (void *)(unsigned long) ptr;
+}
+
+static inline __u64 ptr_to_u64(void *ptr)
+{
+ return (__u64)(unsigned long)ptr;
+}
+
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2732,7 +2741,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
req->rw.addr = READ_ONCE(sqe->addr);
- req->rw.len = READ_ONCE(sqe->len);
+ req->rw.kiocb.private = u64_to_ptr(READ_ONCE(sqe->len));
req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -2799,7 +2808,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
struct io_mapped_ubuf *imu)
{
- size_t len = req->rw.len;
+ size_t len = ptr_to_u64(req->rw.kiocb.private);
u64 buf_end, buf_addr = req->rw.addr;
size_t offset;
@@ -2997,7 +3006,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
iov[0].iov_len = kbuf->len;
return 0;
}
- if (req->rw.len != 1)
+ if (ptr_to_u64(req->rw.kiocb.private) != 1)
return -EINVAL;
#ifdef CONFIG_COMPAT
@@ -3012,7 +3021,7 @@ static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
struct iov_iter *iter, bool needs_lock)
{
void __user *buf = u64_to_user_ptr(req->rw.addr);
- size_t sqe_len = req->rw.len;
+ size_t sqe_len = ptr_to_u64(req->rw.kiocb.private);
u8 opcode = req->opcode;
ssize_t ret;
@@ -3030,7 +3039,7 @@ static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
if (IS_ERR(buf))
return PTR_ERR(buf);
- req->rw.len = sqe_len;
+ req->rw.kiocb.private = u64_to_ptr(sqe_len);
}
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
@@ -3063,6 +3072,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
{
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
+ unsigned long rw_len;
ssize_t ret = 0;
/*
@@ -3075,6 +3085,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
if (kiocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
+ rw_len = ptr_to_u64(req->rw.kiocb.private);
while (iov_iter_count(iter)) {
struct iovec iovec;
ssize_t nr;
@@ -3083,7 +3094,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
iovec = iov_iter_iovec(iter);
} else {
iovec.iov_base = u64_to_user_ptr(req->rw.addr);
- iovec.iov_len = req->rw.len;
+ iovec.iov_len = rw_len;
}
if (rw == READ) {
@@ -3102,7 +3113,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
ret += nr;
if (nr != iovec.iov_len)
break;
- req->rw.len -= nr;
+ rw_len -= nr;
req->rw.addr += nr;
iov_iter_advance(iter, nr);
}
--
2.32.0
* [PATCH 3/5] fs: add ki_bio_cache pointer to struct kiocb
From: Jens Axboe @ 2021-08-10 16:37 UTC
To: io-uring; +Cc: linux-block, Jens Axboe
This allows an issuer (and owner of a kiocb) to pass in a bio allocation
cache that can be used to reduce the overhead of repeatedly allocating
and freeing bios.
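An issuer that owns both the kiocb and a cache would opt in roughly as
follows (a sketch only, assuming CONFIG_BLOCK; the function name is made
up, the real io_uring wiring comes in the next patch):

	static void issuer_attach_bio_cache(struct kiocb *kiocb,
					    struct bio_alloc_cache *cache)
	{
		kiocb->ki_flags |= IOCB_ALLOC_CACHE;
		kiocb->ki_bio_cache = cache;
	}

Lower layers only look at ki_bio_cache when IOCB_ALLOC_CACHE is set, so
existing kiocb users are unaffected.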
Signed-off-by: Jens Axboe <[email protected]>
---
include/linux/fs.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 640574294216..5f17d10ddc2d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -291,6 +291,7 @@ struct page;
struct address_space;
struct writeback_control;
struct readahead_control;
+struct bio_alloc_cache;
/*
* Write life time hint values.
@@ -319,6 +320,8 @@ enum rw_hint {
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ (1 << 19)
#define IOCB_NOIO (1 << 20)
+/* iocb->ki_bio_cache is valid */
+#define IOCB_ALLOC_CACHE (1 << 21)
struct kiocb {
struct file *ki_filp;
@@ -337,6 +340,14 @@ struct kiocb {
struct wait_page_queue *ki_waitq; /* for async buffered IO */
};
+ /*
+ * If set, owner of iov_iter can pass in a fast-cache for bio
+ * allocations.
+ */
+#ifdef CONFIG_BLOCK
+ struct bio_alloc_cache *ki_bio_cache;
+#endif
+
randomized_struct_fields_end
};
--
2.32.0
* [PATCH 4/5] io_uring: wire up bio allocation cache
From: Jens Axboe @ 2021-08-10 16:37 UTC
To: io-uring; +Cc: linux-block, Jens Axboe
Initialize a bio allocation cache, and mark it as being used for
IOPOLL. We could use it for non-polled IO as well, but it'd need some
locking and that would probably negate much of the win in that case.
We start with IOPOLL, as completions are already serialized by the ctx
lock, so no further locking is needed there.
This brings an IOPOLL gen2 Optane QD=128 workload from ~3.0M IOPS to
~3.3M IOPS.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f35b54f016f3..60316cfc712a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -324,6 +324,10 @@ struct io_submit_state {
/* inline/task_work completion list, under ->uring_lock */
struct list_head free_list;
+#ifdef CONFIG_BLOCK
+ struct bio_alloc_cache bio_cache;
+#endif
+
/*
* File reference cache
*/
@@ -1201,6 +1205,9 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
INIT_LIST_HEAD(&ctx->submit_state.free_list);
+#ifdef CONFIG_BLOCK
+ bio_alloc_cache_init(&ctx->submit_state.bio_cache);
+#endif
INIT_LIST_HEAD(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
return ctx;
@@ -2267,6 +2274,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
!(req->flags & REQ_F_DONT_REISSUE)) {
req->iopoll_completed = 0;
+ /* Don't use cache for async retry, not locking safe */
+ req->rw.kiocb.ki_flags &= ~IOCB_ALLOC_CACHE;
req_ref_get(req);
io_req_task_queue_reissue(req);
continue;
@@ -2684,6 +2693,31 @@ static inline __u64 ptr_to_u64(void *ptr)
return (__u64)(unsigned long)ptr;
}
+static void io_mark_alloc_cache(struct io_kiocb *req)
+{
+#ifdef CONFIG_BLOCK
+ struct kiocb *kiocb = &req->rw.kiocb;
+ struct block_device *bdev = NULL;
+
+ if (S_ISBLK(file_inode(kiocb->ki_filp)->i_mode))
+ bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+ else if (S_ISREG(file_inode(kiocb->ki_filp)->i_mode))
+ bdev = kiocb->ki_filp->f_inode->i_sb->s_bdev;
+
+ /*
+ * If the lower level device doesn't support polled IO, then
+ * we cannot safely use the alloc cache. This really should
+ * be a failure case for polled IO...
+ */
+ if (!bdev ||
+ !test_bit(QUEUE_FLAG_POLL, &bdev_get_queue(bdev)->queue_flags))
+ return;
+
+ kiocb->ki_flags |= IOCB_ALLOC_CACHE;
+ kiocb->ki_bio_cache = &req->ctx->submit_state.bio_cache;
+#endif /* CONFIG_BLOCK */
+}
+
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2726,6 +2760,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HIPRI;
+ io_mark_alloc_cache(req);
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
} else {
@@ -2792,6 +2827,8 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (check_reissue && (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
if (io_resubmit_prep(req)) {
+ /* Don't use cache for async retry, not locking safe */
+ req->rw.kiocb.ki_flags &= ~IOCB_ALLOC_CACHE;
req_ref_get(req);
io_req_task_queue_reissue(req);
} else {
@@ -8640,6 +8677,9 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
state->free_reqs = 0;
}
+#ifdef CONFIG_BLOCK
+ bio_alloc_cache_destroy(&state->bio_cache);
+#endif
io_flush_cached_locked_reqs(ctx, state);
io_req_cache_free(&state->free_list);
mutex_unlock(&ctx->uring_lock);
--
2.32.0
* [PATCH 5/5] block: enable use of bio allocation cache
From: Jens Axboe @ 2021-08-10 16:37 UTC
To: io-uring; +Cc: linux-block, Jens Axboe
If the kiocb passed in has a bio cache specified, then use that to
allocate (and free) a new bio if possible.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/block_dev.c | 32 ++++++++++++++++++++++++++++----
1 file changed, 28 insertions(+), 4 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9ef4f1fc2cb0..a192c5672430 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -327,6 +327,14 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}
+static void dio_bio_put(struct blkdev_dio *dio)
+{
+ if (dio->iocb->ki_flags & IOCB_ALLOC_CACHE)
+ bio_cache_put(dio->iocb->ki_bio_cache, &dio->bio);
+ else
+ bio_put(&dio->bio);
+}
+
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
@@ -362,7 +370,7 @@ static void blkdev_bio_end_io(struct bio *bio)
bio_check_pages_dirty(bio);
} else {
bio_release_pages(bio, false);
- bio_put(bio);
+ dio_bio_put(dio);
}
}
@@ -385,7 +393,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
+ bio = NULL;
+ if (iocb->ki_flags & IOCB_ALLOC_CACHE) {
+ bio = bio_cache_get(iocb->ki_bio_cache, GFP_KERNEL, nr_pages,
+ &blkdev_dio_pool);
+ if (!bio)
+ iocb->ki_flags &= ~IOCB_ALLOC_CACHE;
+ }
+ if (!bio)
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@ -467,7 +483,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio = NULL;
+ if (iocb->ki_flags & IOCB_ALLOC_CACHE) {
+ bio = bio_cache_get(iocb->ki_bio_cache, GFP_KERNEL,
+ nr_pages, &fs_bio_set);
+ if (!bio)
+ iocb->ki_flags &= ~IOCB_ALLOC_CACHE;
+ }
+ if (!bio)
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
}
if (!is_poll)
@@ -492,7 +516,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (likely(!ret))
ret = dio->size;
- bio_put(&dio->bio);
+ dio_bio_put(dio);
return ret;
}
--
2.32.0
* Re: [PATCHSET v3 0/5] Enable bio recycling for polled IO
From: Jens Axboe @ 2021-08-10 16:44 UTC
To: io-uring; +Cc: linux-block
On 8/10/21 10:37 AM, Jens Axboe wrote:
> Hi,
>
> This is v3 of this patchset. We're back to passing the cache pointer
> in the kiocb; I do think that's the cleanest and also the most
> efficient approach. A patch has been added to remove a member from
> the io_uring req_rw structure, so that the kiocb size bump doesn't
> cause the per-command part of io_kiocb to bump into the next
> cacheline.
>
> Another benefit of this approach is that we get per-ring caching.
> That means if an application splits polled IO into two threads, one
> doing submit and one doing reaps, then we still get the full benefit
> of the bio caching.
>
> The tl;dr here is that we get about a 10% bump in polled performance with
> this patchset, as we can recycle bio structures essentially for free.
> Outside of that, there are explanations in each patch. I've also got an
> iomap patch, but I'm trying to keep this to a single user until there's
> agreement on the direction.
>
> Against for-5.15/io_uring, and can also be found in my
> io_uring-bio-cache.3 branch.
As a reference, before the patches:
axboe@amd ~/g/fio (master)> sudo taskset -c 0 t/io_uring -b512 -d128 -s32 -c32 -p1 -F1 -B1 /dev/nvme3n1
i 8, argc 9
Added file /dev/nvme3n1 (submitter 0)
sq_ring ptr = 0x0x7f63a7c42000
sqes ptr = 0x0x7f63a7c40000
cq_ring ptr = 0x0x7f63a7c3e000
polled=1, fixedbufs=1, register_files=1, buffered=0 QD=128, sq_ring=128, cq_ring=256
submitter=1600
IOPS=3111520, IOS/call=32/31, inflight=128 (128)
or around 3.1M IOPS (single thread, single core), and after:
axboe@amd ~/g/fio (master)> sudo taskset -c 0 t/io_uring -b512 -d128 -s32 -c32 -p1 -F1 -B1 /dev/nvme3n1
i 8, argc 9
Added file /dev/nvme3n1 (submitter 0)
sq_ring ptr = 0x0x7f62726bc000
sqes ptr = 0x0x7f62726ba000
cq_ring ptr = 0x0x7f62726b8000
polled=1, fixedbufs=1, register_files=1, buffered=0 QD=128, sq_ring=128, cq_ring=256
submitter=1791
IOPS=3417120, IOS/call=32/31, inflight=128 (128)
which is roughly a 10% increase in per-core IOPS for this kind of
workload.
--
Jens Axboe
* Re: [PATCHSET v3 0/5] Enable bio recycling for polled IO
From: Christoph Hellwig @ 2021-08-11 8:26 UTC
To: Jens Axboe; +Cc: io-uring, linux-block
I really don't like all the layering violations in here. What is the
problem with a simple (optional) percpu cache in the bio_set? Something
like the completely untested patch below:
diff --git a/block/bio.c b/block/bio.c
index 33160007f4e0..edd4a83b96fa 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -25,6 +25,11 @@
#include "blk.h"
#include "blk-rq-qos.h"
+struct bio_alloc_cache {
+ struct bio_list free_list;
+ unsigned int nr;
+};
+
static struct biovec_slab {
int nr_vecs;
char *name;
@@ -239,6 +244,35 @@ static void bio_free(struct bio *bio)
}
}
+static inline void __bio_init(struct bio *bio)
+{
+ bio->bi_next = NULL;
+ bio->bi_bdev = NULL;
+ bio->bi_opf = 0;
+ bio->bi_flags = bio->bi_ioprio = bio->bi_write_hint = 0;
+ bio->bi_status = 0;
+ bio->bi_iter.bi_sector = 0;
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_idx = 0;
+ bio->bi_iter.bi_bvec_done = 0;
+ bio->bi_end_io = NULL;
+ bio->bi_private = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ bio->bi_blkg = NULL;
+ bio->bi_issue.value = 0;
+#ifdef CONFIG_BLK_CGROUP_IOCOST
+ bio->bi_iocost_cost = 0;
+#endif
+#endif
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ bio->bi_crypt_context = NULL;
+#endif
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ bio->bi_integrity = NULL;
+#endif
+ bio->bi_vcnt = 0;
+}
+
/*
* Users of this function have their own bio allocation. Subsequently,
* they must remember to pair any call to bio_init() with bio_uninit()
@@ -247,7 +281,7 @@ static void bio_free(struct bio *bio)
void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs)
{
- memset(bio, 0, sizeof(*bio));
+ __bio_init(bio);
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
bio->bi_cookie = BLK_QC_T_NONE;
@@ -470,6 +504,31 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
}
EXPORT_SYMBOL(bio_alloc_bioset);
+struct bio *bio_alloc_iocb(struct kiocb *iocb, unsigned short nr_vecs,
+ struct bio_set *bs)
+{
+ struct bio_alloc_cache *cache = NULL;
+ struct bio *bio;
+
+ if (!(iocb->ki_flags & IOCB_HIPRI) ||
+ !(iocb->ki_flags & IOCB_NOWAIT) ||
+ nr_vecs > BIO_INLINE_VECS)
+ return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+
+ cache = per_cpu_ptr(bs->cache, get_cpu());
+ bio = bio_list_pop(&cache->free_list);
+ if (bio) {
+ bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+ cache->nr--;
+ }
+ put_cpu();
+
+ if (!bio)
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ bio_set_flag(bio, BIO_CACHEABLE);
+ return bio;
+}
+
/**
* bio_kmalloc - kmalloc a bio for I/O
* @gfp_mask: the GFP_* mask given to the slab allocator
@@ -588,6 +647,46 @@ void guard_bio_eod(struct bio *bio)
bio_truncate(bio, maxsector << 9);
}
+#define ALLOC_CACHE_MAX 512
+#define ALLOC_CACHE_SLACK 64
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ struct bio *bio;
+ unsigned int i;
+
+ i = 0;
+ while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
+ cache->nr--;
+ bio_free(bio);
+ if (++i == nr)
+ break;
+ }
+}
+
+#if 0
+// XXX: add a cpu down notifier to call this
+void bio_alloc_cache_destroy(struct bio_alloc_cache *cache)
+{
+ bio_alloc_cache_prune(cache, -1U);
+}
+#endif
+
+static void bio_add_to_cache(struct bio *bio)
+{
+ struct bio_alloc_cache *cache;
+
+ bio_uninit(bio);
+
+ cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+ bio_list_add_head(&cache->free_list, bio);
+ cache->nr++;
+ if (cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
+ bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
+ put_cpu();
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@@ -598,17 +697,16 @@ void guard_bio_eod(struct bio *bio)
**/
void bio_put(struct bio *bio)
{
- if (!bio_flagged(bio, BIO_REFFED))
- bio_free(bio);
- else {
+ if (bio_flagged(bio, BIO_REFFED)) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
-
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->__bi_cnt))
- bio_free(bio);
+ if (!atomic_dec_and_test(&bio->__bi_cnt))
+ return;
}
+
+ if (bio_flagged(bio, BIO_CACHEABLE))
+ bio_add_to_cache(bio);
+ else
+ bio_free(bio);
}
EXPORT_SYMBOL(bio_put);
@@ -1487,6 +1585,7 @@ int biovec_init_pool(mempool_t *pool, int pool_entries)
*/
void bioset_exit(struct bio_set *bs)
{
+ free_percpu(bs->cache);
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
bs->rescue_workqueue = NULL;
@@ -1548,12 +1647,18 @@ int bioset_init(struct bio_set *bs,
biovec_init_pool(&bs->bvec_pool, pool_size))
goto bad;
- if (!(flags & BIOSET_NEED_RESCUER))
- return 0;
-
- bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
- if (!bs->rescue_workqueue)
- goto bad;
+ if (flags & BIOSET_NEED_RESCUER) {
+ bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM,
+ 0);
+ if (!bs->rescue_workqueue)
+ goto bad;
+ }
+
+ if (flags & BIOSET_PERCPU_CACHE) {
+ bs->cache = alloc_percpu(struct bio_alloc_cache);
+ if (!bs->cache)
+ goto bad;
+ }
return 0;
bad:
@@ -1594,7 +1699,8 @@ static int __init init_bio(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
- if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
+ if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0,
+ BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
panic("bio: can't allocate bios\n");
if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e95889ff4fba..c67043bfb788 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -376,8 +376,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
-
+ bio = bio_alloc_iocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
if (dio->is_sync) {
@@ -452,7 +451,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio = bio_alloc_iocb(iocb, nr_pages, &fs_bio_set);
}
if (!(iocb->ki_flags & IOCB_HIPRI))
@@ -497,7 +496,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static __init int blkdev_init(void)
{
- return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
+ return bioset_init(&blkdev_dio_pool, 4,
+ offsetof(struct blkdev_dio, bio),
+ BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);
}
module_init(blkdev_init);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 35de19f2ae88..69850bfddf18 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -400,6 +400,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
enum {
BIOSET_NEED_BVECS = BIT(0),
BIOSET_NEED_RESCUER = BIT(1),
+ BIOSET_PERCPU_CACHE = BIT(2),
};
extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
extern void bioset_exit(struct bio_set *);
@@ -656,7 +657,7 @@ static inline void bio_inc_remaining(struct bio *bio)
struct bio_set {
struct kmem_cache *bio_slab;
unsigned int front_pad;
-
+ struct bio_alloc_cache __percpu *cache;
mempool_t bio_pool;
mempool_t bvec_pool;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e3a70dd0470b..7a7d9c6b33ee 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -300,6 +300,7 @@ enum {
BIO_TRACKED, /* set if bio goes through the rq_qos path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_CACHEABLE, /* can be added to the percpu cache */
BIO_FLAG_LAST
};
* Re: [PATCHSET v3 0/5] Enable bio recycling for polled IO
From: Christoph Hellwig @ 2021-08-11 11:13 UTC
To: Jens Axboe; +Cc: io-uring, linux-block
On Wed, Aug 11, 2021 at 09:26:33AM +0100, Christoph Hellwig wrote:
> I really don't like all the layering violations in here. What is the
> problem with a simple (optional) percpu cache in the bio_set? Something
> like the completely untested patch below:
A slightly updated version that actually compiles and survives minimal
testing is here:
http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/bio-cache
* Re: [PATCHSET v3 0/5] Enable bio recycling for polled IO
From: Jens Axboe @ 2021-08-11 15:05 UTC
To: Christoph Hellwig; +Cc: io-uring, linux-block
On 8/11/21 5:13 AM, Christoph Hellwig wrote:
> On Wed, Aug 11, 2021 at 09:26:33AM +0100, Christoph Hellwig wrote:
>> I really don't like all the layering violations in here. What is the
>> problem with a simple (optional) percpu cache in the bio_set? Something
>> like the completely untested patch below:
>
> A slightly updated version that actually compiles and survives minimal
> testing is here:
>
> http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/bio-cache
I like this approach; I've done something very similar in the past. But
the key here is the opt-in, to avoid needing IRQ safety/locking for the
cache, which would defeat the purpose. That's why it was done just for
polling, obviously.
I'll test a bit and re-spin the series.
--
Jens Axboe
* Re: [PATCHSET v3 0/5] Enable bio recycling for polled IO
From: Christoph Hellwig @ 2021-08-11 15:08 UTC
To: Jens Axboe; +Cc: Christoph Hellwig, io-uring, linux-block
On Wed, Aug 11, 2021 at 09:05:47AM -0600, Jens Axboe wrote:
> On 8/11/21 5:13 AM, Christoph Hellwig wrote:
> > On Wed, Aug 11, 2021 at 09:26:33AM +0100, Christoph Hellwig wrote:
> >> I really don't like all the layering violations in here. What is the
> >> problem with a simple (optional) percpu cache in the bio_set? Something
> >> like the completely untested patch below:
> >
> > A slightly updated version that actually compiles and survives minimal
> > testing is here:
> >
> > http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/bio-cache
>
> I like this approach, I've done something very similar in the past. But
> the key here is the opt-in, to avoid needing IRQ/locking for the cache
> which defeats the purpose. That's why it was done just for polling,
> obviously.
>
> I'll test a bit and re-spin the series.
The series above now has the opt-in, as I realized the same thing after
a little testing.
* Re: [PATCHSET v3 0/5] Enable bio recycling for polled IO
From: Ming Lei @ 2021-08-11 15:06 UTC
To: Christoph Hellwig; +Cc: Jens Axboe, io-uring, linux-block
On Wed, Aug 11, 2021 at 12:13:23PM +0100, Christoph Hellwig wrote:
> On Wed, Aug 11, 2021 at 09:26:33AM +0100, Christoph Hellwig wrote:
> > I really don't like all the layering violations in here. What is the
> > problem with a simple (optional) percpu cache in the bio_set? Something
> > like the completely untested patch below:
>
> A slightly updated version that actually compiles and survives minimal
> testing is here:
>
> http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/bio-cache
Just wondering whether the percpu bio cache really performs better than
slab, which should behave very similarly to a percpu allocation. And here
the cached bio is just a single fixed-length bio with inline bvecs.
BTW, in the 'io_uring: ask for bio caching' patch, you have to avoid
setting IOCB_ALLOC_CACHE, just like Jens did; otherwise it will break
when io_uring is used on a queue without POLL capability.
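I.e. something along the lines of the QUEUE_FLAG_POLL check from Jens'
patch 4 (sketch only, the function name here is made up):

	static void io_mark_cacheable(struct kiocb *kiocb,
				      struct block_device *bdev)
	{
		/* only opt in to the bio cache if the queue can be polled */
		if (test_bit(QUEUE_FLAG_POLL, &bdev_get_queue(bdev)->queue_flags))
			kiocb->ki_flags |= IOCB_ALLOC_CACHE;
	}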
Thanks,
Ming