From: Hao Xu <[email protected]>
To: Jens Axboe <[email protected]>
Cc: [email protected], Pavel Begunkov <[email protected]>,
Joseph Qi <[email protected]>
Subject: [PATCH 2/5] io_uring: add a priority tw list for irq completion work
Date: Tue, 7 Dec 2021 17:39:48 +0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
Now we have a lot of task_work users, some are just to complete a req
and generate a cqe. Let's put the work to a new tw list which has a
higher priority, so that it can be handled quickly and thus to reduce
avg req latency and users can issue next round of sqes earlier.
An explanatory case:
origin timeline:
submit_sqe-->irq-->add completion task_work
-->run heavy work0~n-->run completion task_work
now timeline:
submit_sqe-->irq-->add completion task_work
-->run completion task_work-->run heavy work0~n
Limitation: this optimization is only for those that submission and
reaping process are in different threads. Otherwise anyhow we have to
submit new sqes after returning to userspace, then the order of TWs
doesn't matter.
Tested this patch(and the following ones) by manually replace
__io_queue_sqe() in io_queue_sqe() by io_req_task_queue() to construct
'heavy' task works. Then test with fio:
ioengine=io_uring
sqpoll=1
thread=1
bs=4k
direct=1
rw=randread
time_based=1
runtime=600
randrepeat=0
group_reporting=1
filename=/dev/nvme0n1
Tried various iodepth.
The peak IOPS for this patch is 710K, while the old one is 665K.
For avg latency, difference shows when iodepth grow:
depth and avg latency(usec):
depth new old
1 7.05 7.10
2 8.47 8.60
4 10.42 10.42
8 13.78 13.22
16 27.41 24.33
32 49.40 53.08
64 102.53 103.36
128 196.98 205.61
256 372.99 414.88
512 747.23 791.30
1024 1472.59 1538.72
2048 3153.49 3329.01
4096 6387.86 6682.54
8192 12150.25 12774.14
16384 23085.58 26044.71
Signed-off-by: Hao Xu <[email protected]>
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 36 ++++++++++++++++++++----------------
1 file changed, 20 insertions(+), 16 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1265dc1942eb..ad389466a912 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -474,6 +474,7 @@ struct io_uring_task {
spinlock_t task_lock;
struct io_wq_work_list task_list;
+ struct io_wq_work_list prior_task_list;
struct callback_head task_work;
bool task_running;
};
@@ -2226,12 +2227,12 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node;
- if (!tctx->task_list.first && locked)
+ if (!tctx->prior_task_list.first &&
+ !tctx->task_list.first && locked)
io_submit_flush_completions(ctx);
spin_lock_irq(&tctx->task_lock);
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
+ node= wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
if (!node)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
@@ -2260,7 +2261,7 @@ static void tctx_task_work(struct callback_head *cb)
ctx_flush_and_put(ctx, &locked);
}
-static void io_req_task_work_add(struct io_kiocb *req)
+static void io_req_task_work_add(struct io_kiocb *req, bool priority)
{
struct task_struct *tsk = req->task;
struct io_uring_task *tctx = tsk->io_uring;
@@ -2272,7 +2273,10 @@ static void io_req_task_work_add(struct io_kiocb *req)
WARN_ON_ONCE(!tctx);
spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+ if (priority)
+ wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
+ else
+ wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -2297,8 +2301,7 @@ static void io_req_task_work_add(struct io_kiocb *req)
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
+ node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -2335,19 +2338,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
req->result = ret;
req->io_task_work.func = io_req_task_cancel;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static void io_req_task_queue(struct io_kiocb *req)
{
req->io_task_work.func = io_req_task_submit;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
req->io_task_work.func = io_queue_async_work;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static inline void io_queue_next(struct io_kiocb *req)
@@ -2457,7 +2460,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req)
{
if (req_ref_put_and_test(req)) {
req->io_task_work.func = io_free_req_work;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
}
@@ -2744,7 +2747,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
return;
req->result = res;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, true);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
@@ -2986,7 +2989,7 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret,
req_set_fail(req);
req->result = ret;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
}
}
@@ -5309,7 +5312,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
return 1;
}
@@ -5972,7 +5975,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
req->result = -ETIME;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
return HRTIMER_NORESTART;
}
@@ -6947,7 +6950,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_link_timeout;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
return HRTIMER_NORESTART;
}
@@ -8662,6 +8665,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
+ INIT_WQ_LIST(&tctx->prior_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
--
2.25.1
next prev parent reply other threads:[~2021-12-07 9:40 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-12-07 9:39 [PATCH v7 0/5] task optimization Hao Xu
2021-12-07 9:39 ` [PATCH 1/5] io-wq: add helper to merge two wq_lists Hao Xu
2021-12-07 9:39 ` Hao Xu [this message]
2021-12-07 9:39 ` [PATCH 3/5] io_uring: add helper for task work execution code Hao Xu
2021-12-07 9:39 ` [PATCH 4/5] io_uring: split io_req_complete_post() and add a helper Hao Xu
2021-12-07 9:39 ` [PATCH 5/5] io_uring: batch completion in prior_task_list Hao Xu
2021-12-07 21:01 ` Pavel Begunkov
2021-12-07 21:16 ` Pavel Begunkov
2021-12-08 5:04 ` Hao Xu
2021-12-08 5:08 ` Hao Xu
2021-12-07 21:59 ` Jens Axboe
2021-12-08 5:23 ` Hao Xu
2021-12-07 11:18 ` [PATCH v7 0/5] task optimization Hao Xu
2021-12-07 16:48 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox