[PATCH 3/8] io_uring: add a limited tw list for irq completion work

public inbox for [email protected]
 help / color / mirror / Atom feed

From: Hao Xu <[email protected]>
To: Jens Axboe <[email protected]>
Cc: [email protected], Pavel Begunkov <[email protected]>,
	Joseph Qi <[email protected]>
Subject: [PATCH 3/8] io_uring: add a limited tw list for irq completion work
Date: Mon, 27 Sep 2021 18:51:18 +0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

Now we have a lot of task_work users, some are just to complete a req
and generate a cqe. Let's put the work to a new tw list which has a
higher priority, so that it can be handled quickly and thus to reduce
avg req latency. an explanatory case:

origin timeline:
    submit_sqe-->irq-->add completion task_work
    -->run heavy work0~n-->run completion task_work
now timeline:
    submit_sqe-->irq-->add completion task_work
    -->run completion task_work-->run heavy work0~n

One thing to watch out is sometimes irq completion TWs comes
overwhelmingly, which makes the new tw list grows fast, and TWs in
the old list are starved. So we have to limit the length of the new
tw list. A practical value is 1/3:
    len of new tw list < 1/3 * (len of new + old tw list)

In this way, the new tw list has a limited length and normal task get
there chance to run.

Tested this patch(and the following ones) by manually replace
__io_queue_sqe() to io_req_task_complete() to construct 'heavy' task
works. Then test with fio:

ioengine=io_uring
thread=1
bs=4k
direct=1
rw=randread
time_based=1
runtime=600
randrepeat=0
group_reporting=1
filename=/dev/nvme0n1

Tried various iodepth.
The peak IOPS for this patch is 314K, while the old one is 249K.
For avg latency, difference shows when iodepth grow:
depth and avg latency(usec):
	depth      new          old
	 1        22.80        23.77
	 2        23.48        24.54
	 4        24.26        25.57
	 8        29.21        32.89
	 16       53.61        63.50
	 32       106.29       131.34
	 64       217.21       256.33
	 128      421.59       513.87
	 256      815.15       1050.99

95%, 99% etc more data in cover letter.

Signed-off-by: Hao Xu <[email protected]>
---
 fs/io_uring.c | 46 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8317c360f7a4..582ef7f55a35 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -461,6 +461,7 @@ struct io_ring_ctx {
 	};
 };
 
+#define MAX_EMERGENCY_TW_RATIO	3
 struct io_uring_task {
 	/* submission side */
 	int			cached_refs;
@@ -475,6 +476,9 @@ struct io_uring_task {
 	spinlock_t		task_lock;
 	struct io_wq_work_list	task_list;
 	struct callback_head	task_work;
+	struct io_wq_work_list	prior_task_list;
+	unsigned int		nr;
+	unsigned int		prior_nr;
 	bool			task_running;
 };
 
@@ -2131,13 +2135,18 @@ static void tctx_task_work(struct callback_head *cb)
 
 	while (1) {
 		struct io_wq_work_node *node;
+		struct io_wq_work_list *merged_list;
 
-		if (!tctx->task_list.first && locked)
+		if (!tctx->prior_task_list.first &&
+		    !tctx->task_list.first && locked)
 			io_submit_flush_completions(ctx);
 
 		spin_lock_irq(&tctx->task_lock);
-		node = tctx->task_list.first;
+		merged_list = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
+		node = merged_list->first;
 		INIT_WQ_LIST(&tctx->task_list);
+		INIT_WQ_LIST(&tctx->prior_task_list);
+		tctx->nr = tctx->prior_nr = 0;
 		if (!node)
 			tctx->task_running = false;
 		spin_unlock_irq(&tctx->task_lock);
@@ -2166,19 +2175,26 @@ static void tctx_task_work(struct callback_head *cb)
 	ctx_flush_and_put(ctx, &locked);
 }
 
-static void io_req_task_work_add(struct io_kiocb *req)
+static void io_req_task_work_add(struct io_kiocb *req, bool emergency)
 {
 	struct task_struct *tsk = req->task;
 	struct io_uring_task *tctx = tsk->io_uring;
 	enum task_work_notify_mode notify;
 	struct io_wq_work_node *node;
+	struct io_wq_work_list *merged_list;
 	unsigned long flags;
 	bool running;
 
 	WARN_ON_ONCE(!tctx);
 
 	spin_lock_irqsave(&tctx->task_lock, flags);
-	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+	if (emergency && tctx->prior_nr * MAX_EMERGENCY_TW_RATIO < tctx->nr) {
+		wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
+		tctx->prior_nr++;
+	} else {
+		wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+	}
+	tctx->nr++;
 	running = tctx->task_running;
 	if (!running)
 		tctx->task_running = true;
@@ -2202,9 +2218,12 @@ static void io_req_task_work_add(struct io_kiocb *req)
 	}
 
 	spin_lock_irqsave(&tctx->task_lock, flags);
+	tctx->nr = tctx->prior_nr = 0;
 	tctx->task_running = false;
-	node = tctx->task_list.first;
+	merged_list = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
+	node = merged_list->first;
 	INIT_WQ_LIST(&tctx->task_list);
+	INIT_WQ_LIST(&tctx->prior_task_list);
 	spin_unlock_irqrestore(&tctx->task_lock, flags);
 
 	while (node) {
@@ -2241,19 +2260,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
 {
 	req->result = ret;
 	req->io_task_work.func = io_req_task_cancel;
-	io_req_task_work_add(req);
+	io_req_task_work_add(req, true);
 }
 
 static void io_req_task_queue(struct io_kiocb *req)
 {
 	req->io_task_work.func = io_req_task_submit;
-	io_req_task_work_add(req);
+	io_req_task_work_add(req, false);
 }
 
 static void io_req_task_queue_reissue(struct io_kiocb *req)
 {
 	req->io_task_work.func = io_queue_async_work;
-	io_req_task_work_add(req);
+	io_req_task_work_add(req, false);
 }
 
 static inline void io_queue_next(struct io_kiocb *req)
@@ -2373,7 +2392,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req)
 {
 	if (req_ref_put_and_test(req)) {
 		req->io_task_work.func = io_free_req_work;
-		io_req_task_work_add(req);
+		io_req_task_work_add(req, false);
 	}
 }
 
@@ -2693,7 +2712,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 		return;
 	req->result = res;
 	req->io_task_work.func = io_req_task_complete;
-	io_req_task_work_add(req);
+	io_req_task_work_add(req, true);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -5256,7 +5275,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	 * of executing it. We can't safely execute it anyway, as we may not
 	 * have the needed state needed for it anyway.
 	 */
-	io_req_task_work_add(req);
+	io_req_task_work_add(req, false);
 	return 1;
 }
 
@@ -5934,7 +5953,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
 
 	req->io_task_work.func = io_req_task_timeout;
-	io_req_task_work_add(req);
+	io_req_task_work_add(req, false);
 	return HRTIMER_NORESTART;
 }
 
@@ -6916,7 +6935,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
 
 	req->io_task_work.func = io_req_task_link_timeout;
-	io_req_task_work_add(req);
+	io_req_task_work_add(req, false);
 	return HRTIMER_NORESTART;
 }
 
@@ -8543,6 +8562,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
 	task->io_uring = tctx;
 	spin_lock_init(&tctx->task_lock);
 	INIT_WQ_LIST(&tctx->task_list);
+	INIT_WQ_LIST(&tctx->prior_task_list);
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
-- 
2.24.4

next prev parent reply	other threads:[~2021-09-27 10:51 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-27 10:51 [PATCH v2 0/8] task_work optimization Hao Xu
2021-09-27 10:51 ` [PATCH 1/8] io-wq: code clean for io_wq_add_work_after() Hao Xu
2021-09-27 10:51 ` [PATCH 2/8] io-wq: add helper to merge two wq_lists Hao Xu
2021-09-27 10:51 ` Hao Xu [this message]
2021-09-29 12:31   ` [PATCH 3/8] io_uring: add a limited tw list for irq completion work Hao Xu
2021-09-27 10:51 ` [PATCH 4/8] io_uring: add helper for task work execution code Hao Xu
2021-09-27 10:51 ` [PATCH 5/8] io_uring: split io_req_complete_post() and add a helper Hao Xu
2021-09-27 10:51 ` [PATCH 6/8] io_uring: move up io_put_kbuf() and io_put_rw_kbuf() Hao Xu
2021-09-27 10:51 ` [PATCH 7/8] io_uring: add tw_ctx for io_uring_task Hao Xu
2021-09-27 10:51 ` [PATCH 8/8] io_uring: batch completion in prior_task_list Hao Xu
  -- strict thread matches above, loose matches on Subject: below --
2021-09-27  6:17 [PATCH 0/6] task_work optimization Hao Xu
2021-09-27  6:17 ` [PATCH 3/8] io_uring: add a limited tw list for irq completion work Hao Xu
2021-09-28 11:29   ` Pavel Begunkov
2021-09-28 16:55     ` Hao Xu
2021-09-29 11:25       ` Pavel Begunkov
2021-09-29 11:38     ` Hao Xu
2021-09-30  9:02       ` Pavel Begunkov
2021-09-30  3:21     ` Hao Xu

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:8317c360f7a dfblob:582ef7f55a3 )
 OR (
bs:"[PATCH 3/8] io_uring: add a limited tw list for irq completion work" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox