From: Hao Xu <[email protected]>
To: Jens Axboe <[email protected]>
Cc: [email protected], Pavel Begunkov <[email protected]>,
Joseph Qi <[email protected]>
Subject: [PATCH 2/8] io_uring: add a priority tw list for irq completion work
Date: Wed, 27 Oct 2021 22:02:10 +0800
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>
Now we have a lot of task_work users, and some of them only need to
complete a req and generate a cqe. Put such work on a new tw list with a
higher priority, so that it is handled quickly, which in turn reduces the
average req latency. An explanatory case:

original timeline:
    submit_sqe-->irq-->add completion task_work
    -->run heavy work0~n-->run completion task_work
new timeline:
    submit_sqe-->irq-->add completion task_work
    -->run completion task_work-->run heavy work0~n
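For illustration only, here is a minimal userspace sketch (not the
kernel code; the list and node types here are made up) of why the
priority list makes completion task_work run first: the drain path
merges prior_task_list ahead of task_list, so its entries execute
before any heavy work:

#include <stdio.h>
#include <stddef.h>

struct node {
	const char *name;
	struct node *next;
};

struct list {
	struct node *first;
	struct node *last;
};

static void list_add_tail(struct list *l, struct node *n)
{
	n->next = NULL;
	if (l->last)
		l->last->next = n;
	else
		l->first = n;
	l->last = n;
}

/* Splice @list after @prior and return the combined list, mirroring
 * what a wq_list_merge()-style helper does: @prior drains first. */
static struct list *list_merge(struct list *prior, struct list *list)
{
	if (!prior->first)
		return list;
	if (list->first) {
		prior->last->next = list->first;
		prior->last = list->last;
	}
	return prior;
}

int main(void)
{
	struct list prior = {0}, normal = {0};
	struct node c = { .name = "completion tw" };
	struct node h0 = { .name = "heavy tw0" };
	struct node h1 = { .name = "heavy tw1" };

	list_add_tail(&normal, &h0);
	list_add_tail(&normal, &h1);
	list_add_tail(&prior, &c);	/* irq completion goes to the prior list */

	for (struct node *n = list_merge(&prior, &normal)->first; n; n = n->next)
		printf("run %s\n", n->name);	/* completion tw prints first */
	return 0;
}

The real code does the same merge under tctx->task_lock, using the
wq_list_merge() helper added in patch 1/8.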
Tested this patch (and the following ones) by manually replacing
__io_queue_sqe() in io_queue_sqe() with io_req_task_queue() to construct
'heavy' task works, then tested with fio:
ioengine=io_uring
sqpoll=1
thread=1
bs=4k
direct=1
rw=randread
time_based=1
runtime=600
randrepeat=0
group_reporting=1
filename=/dev/nvme0n1
Tried various iodepth values; an illustrative invocation is sketched below.
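For reference, assuming the job options above are saved to a file (the
name tw.fio is made up here) with an iodepth= line appended per run, a
test run looks something like:

    fio tw.fio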
The peak IOPS for this patch is 710K, while the old one is 665K.
For avg latency, the difference shows up as iodepth grows:
depth and avg latency (usec):
depth new old
1 7.05 7.10
2 8.47 8.60
4 10.42 10.42
8 13.78 13.22
16 27.41 24.33
32 49.40 53.08
64 102.53 103.36
128 196.98 205.61
256 372.99 414.88
512 747.23 791.30
1024 1472.59 1538.72
2048 3153.49 3329.01
4096 6387.86 6682.54
8192 12150.25 12774.14
16384 23085.58 26044.71
Signed-off-by: Hao Xu <[email protected]>
---
fs/io_uring.c | 38 +++++++++++++++++++++++++-------------
1 file changed, 25 insertions(+), 13 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 88c5ee4dc242..9fdac3b9a560 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -467,6 +467,7 @@ struct io_uring_task {
spinlock_t task_lock;
struct io_wq_work_list task_list;
+ struct io_wq_work_list prior_task_list;
struct callback_head task_work;
bool task_running;
};
@@ -2147,13 +2148,17 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node;
+ struct io_wq_work_list *merged_list;
- if (!tctx->task_list.first && locked)
+ if (!tctx->prior_task_list.first &&
+ !tctx->task_list.first && locked)
io_submit_flush_completions(ctx);
spin_lock_irq(&tctx->task_lock);
- node = tctx->task_list.first;
+ merged_list = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
+ node = merged_list->first;
INIT_WQ_LIST(&tctx->task_list);
+ INIT_WQ_LIST(&tctx->prior_task_list);
if (!node)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
@@ -2182,19 +2187,23 @@ static void tctx_task_work(struct callback_head *cb)
ctx_flush_and_put(ctx, &locked);
}
-static void io_req_task_work_add(struct io_kiocb *req)
+static void io_req_task_work_add(struct io_kiocb *req, bool priority)
{
struct task_struct *tsk = req->task;
struct io_uring_task *tctx = tsk->io_uring;
enum task_work_notify_mode notify;
struct io_wq_work_node *node;
+ struct io_wq_work_list *merged_list;
unsigned long flags;
bool running;
WARN_ON_ONCE(!tctx);
spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+ if (priority)
+ wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
+ else
+ wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -2219,8 +2228,10 @@ static void io_req_task_work_add(struct io_kiocb *req)
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = tctx->task_list.first;
+ merged_list = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
+ node = merged_list->first;
INIT_WQ_LIST(&tctx->task_list);
+ INIT_WQ_LIST(&tctx->prior_task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -2257,19 +2268,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
req->result = ret;
req->io_task_work.func = io_req_task_cancel;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, true);
}
static void io_req_task_queue(struct io_kiocb *req)
{
req->io_task_work.func = io_req_task_submit;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
req->io_task_work.func = io_queue_async_work;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static inline void io_queue_next(struct io_kiocb *req)
@@ -2374,7 +2385,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req)
{
if (req_ref_put_and_test(req)) {
req->io_task_work.func = io_free_req_work;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
}
@@ -2677,7 +2688,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
return;
req->result = res;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, true);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -5248,7 +5259,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
return 1;
}
@@ -5916,7 +5927,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_timeout;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
return HRTIMER_NORESTART;
}
@@ -6859,7 +6870,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_link_timeout;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
return HRTIMER_NORESTART;
}
@@ -8563,6 +8574,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
+ INIT_WQ_LIST(&tctx->prior_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
--
2.24.4