* [PATCH] io_uring: set plug tags for same file
@ 2023-05-04 16:24 Keith Busch
2023-05-30 20:45 ` Keith Busch
0 siblings, 1 reply; 3+ messages in thread
From: Keith Busch @ 2023-05-04 16:24 UTC (permalink / raw)
To: axboe, asml.silence, io-uring, linux-block; +Cc: Keith Busch
From: Keith Busch <[email protected]>
io_uring tries to optimize allocating tags by hinting to the plug how
many it expects to need for a batch instead of allocating each tag
individually. But io_uring submission queues may have a mix of many
devices for io, so the number of io's counted may be overestimated. This
can lead to allocating too many tags, which adds overhead to finding
that many contiguous tags, freeing up the ones we didn't use, and may
starve out other users that can actually use them.
When starting a new batch of uring commands, count only commands that
match the file descriptor of the first seen for this optimization.
Signed-off-by: Keith Busch <[email protected]>
---
block/blk-core.c | 49 +++++++++++++++-------------------
include/linux/blkdev.h | 6 -----
include/linux/io_uring_types.h | 1 +
io_uring/io_uring.c | 24 ++++++++++++++---
4 files changed, 43 insertions(+), 37 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 00c74330fa92c..28f1755f37901 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1039,32 +1039,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
-void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
-{
- struct task_struct *tsk = current;
-
- /*
- * If this is a nested plug, don't actually assign it.
- */
- if (tsk->plug)
- return;
-
- plug->mq_list = NULL;
- plug->cached_rq = NULL;
- plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
- plug->rq_count = 0;
- plug->multiple_queues = false;
- plug->has_elevator = false;
- plug->nowait = false;
- INIT_LIST_HEAD(&plug->cb_list);
-
- /*
- * Store ordering should not be needed here, since a potential
- * preempt will imply a full memory barrier
- */
- tsk->plug = plug;
-}
-
/**
* blk_start_plug - initialize blk_plug and track it inside the task_struct
* @plug: The &struct blk_plug that needs to be initialized
@@ -1090,7 +1064,28 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
*/
void blk_start_plug(struct blk_plug *plug)
{
- blk_start_plug_nr_ios(plug, 1);
+ struct task_struct *tsk = current;
+
+ /*
+ * If this is a nested plug, don't actually assign it.
+ */
+ if (tsk->plug)
+ return;
+
+ plug->mq_list = NULL;
+ plug->cached_rq = NULL;
+ plug->nr_ios = 1;
+ plug->rq_count = 0;
+ plug->multiple_queues = false;
+ plug->has_elevator = false;
+ plug->nowait = false;
+ INIT_LIST_HEAD(&plug->cb_list);
+
+ /*
+ * Store ordering should not be needed here, since a potential
+ * preempt will imply a full memory barrier
+ */
+ tsk->plug = plug;
}
EXPORT_SYMBOL(blk_start_plug);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e3242e67a8e3d..1e7cb26e04cb1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -995,7 +995,6 @@ struct blk_plug_cb {
extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
void *data, int size);
extern void blk_start_plug(struct blk_plug *);
-extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
extern void blk_finish_plug(struct blk_plug *);
void __blk_flush_plug(struct blk_plug *plug, bool from_schedule);
@@ -1011,11 +1010,6 @@ long nr_blockdev_pages(void);
struct blk_plug {
};
-static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
- unsigned short nr_ios)
-{
-}
-
static inline void blk_start_plug(struct blk_plug *plug)
{
}
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1b2a20a42413a..dd2fe648b6392 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -175,6 +175,7 @@ struct io_submit_state {
bool need_plug;
unsigned short submit_nr;
unsigned int cqes_count;
+ int fd;
struct blk_plug plug;
struct io_uring_cqe cqes[16];
};
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3bca7a79efda4..ff60342a8f43b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2276,7 +2276,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (state->need_plug && def->plug) {
state->plug_started = true;
state->need_plug = false;
- blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+ state->fd = req->cqe.fd;
+ blk_start_plug(&state->plug);
+ } else if (state->plug_started && req->cqe.fd == state->fd &&
+ !ctx->submit_state.link.head) {
+ state->plug.nr_ios++;
}
}
@@ -2337,7 +2341,8 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
}
static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+ const struct io_uring_sqe *sqe,
+ struct io_wq_work_list *req_list)
__must_hold(&ctx->uring_lock)
{
struct io_submit_link *link = &ctx->submit_state.link;
@@ -2385,7 +2390,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return 0;
}
- io_queue_sqe(req);
+ wq_list_add_tail(&req->comp_list, req_list);
return 0;
}
@@ -2470,6 +2475,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
unsigned int entries = io_sqring_entries(ctx);
+ struct io_wq_work_list req_list;
unsigned int left;
int ret;
@@ -2480,6 +2486,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
io_get_task_refs(left);
io_submit_state_start(&ctx->submit_state, left);
+ INIT_WQ_LIST(&req_list);
do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
@@ -2495,13 +2502,22 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
* Continue submitting even for sqe failure if the
* ring was setup with IORING_SETUP_SUBMIT_ALL
*/
- if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+ if (unlikely(io_submit_sqe(ctx, req, sqe, &req_list)) &&
!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
left--;
break;
}
} while (--left);
+ while (req_list.first) {
+ struct io_kiocb *req;
+
+ req = container_of(req_list.first, struct io_kiocb, comp_list);
+ req_list.first = req->comp_list.next;
+
+ io_queue_sqe(req);
+ }
+
if (unlikely(left)) {
ret -= left;
/* try again if it submitted nothing and can't allocate a req */
--
2.34.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] io_uring: set plug tags for same file
2023-05-04 16:24 [PATCH] io_uring: set plug tags for same file Keith Busch
@ 2023-05-30 20:45 ` Keith Busch
2023-05-31 15:44 ` Jens Axboe
0 siblings, 1 reply; 3+ messages in thread
From: Keith Busch @ 2023-05-30 20:45 UTC (permalink / raw)
To: Keith Busch; +Cc: axboe, asml.silence, io-uring, linux-block
On Thu, May 04, 2023 at 09:24:27AM -0700, Keith Busch wrote:
> From: Keith Busch <[email protected]>
>
> io_uring tries to optimize allocating tags by hinting to the plug how
> many it expects to need for a batch instead of allocating each tag
> individually. But io_uring submission queueus may have a mix of many
> devices for io, so the number of io's counted may be overestimated. This
> can lead to allocating too many tags, which adds overhead to finding
> that many contiguous tags, freeing up the ones we didn't use, and may
> starve out other users that can actually use them.
When running batched IO to multiple nvme drives, like with t/io_uring,
this shows a tiny improvement in CPU utilization from avoiding the
unlikely clean up condition in __blk_flush_plug() shown below:
if (unlikely(!rq_list_empty(plug->cached_rq)))
blk_mq_free_plug_rqs(plug);
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] io_uring: set plug tags for same file
2023-05-30 20:45 ` Keith Busch
@ 2023-05-31 15:44 ` Jens Axboe
0 siblings, 0 replies; 3+ messages in thread
From: Jens Axboe @ 2023-05-31 15:44 UTC (permalink / raw)
To: Keith Busch, Keith Busch; +Cc: asml.silence, io-uring, linux-block
On 5/30/23 2:45 PM, Keith Busch wrote:
> On Thu, May 04, 2023 at 09:24:27AM -0700, Keith Busch wrote:
>> From: Keith Busch <[email protected]>
>>
>> io_uring tries to optimize allocating tags by hinting to the plug how
>> many it expects to need for a batch instead of allocating each tag
>> individually. But io_uring submission queueus may have a mix of many
>> devices for io, so the number of io's counted may be overestimated. This
>> can lead to allocating too many tags, which adds overhead to finding
>> that many contiguous tags, freeing up the ones we didn't use, and may
>> starve out other users that can actually use them.
>
> When running batched IO to multiple nvme drives, like with t/io_uring,
> this shows a tiny improvement in CPU utilization from avoiding the
> unlikely clean up condition in __blk_flush_plug() shown below:
>
> if (unlikely(!rq_list_empty(plug->cached_rq)))
> blk_mq_free_plug_rqs(plug);
Conceptually the patch certainly makes sense, and is probably as close
to ideal on tag reservations as we can get without making things a lot
more complicated (or relying on extra app input).
But it does mean we'll always be doing a second loop over the submission
list, which is obviously not ideal in terms of overhead. I can see a few
potential ways forward:
1) Maybe just submit directly if plugging isn't required? That'd leave
the pending list with just the items that do plug, and avoid the extra
loop for those that don't.
2) At least get something out of the extra loop - alloc and prep
requests upfront, then submit. If we have to do N things to submit a
request, it's always going to be better to bundle each sub-step. This
patch doesn't do that, it just kind of takes the easy way out.
What do you think? I think these are important questions to answer,
particularly as the overhead of flushing extraneous tags seems pretty
minuscule.
--
Jens Axboe
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2023-05-31 15:44 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2023-05-04 16:24 [PATCH] io_uring: set plug tags for same file Keith Busch
2023-05-30 20:45 ` Keith Busch
2023-05-31 15:44 ` Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox