public inbox for [email protected]
 help / color / mirror / Atom feed
From: Hao Xu <[email protected]>
To: Jens Axboe <[email protected]>
Cc: [email protected], Pavel Begunkov <[email protected]>,
	Joseph Qi <[email protected]>
Subject: Re: [PATCH 3/8] io_uring: add a limited tw list for irq completion work
Date: Wed, 29 Sep 2021 20:31:00 +0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

在 2021/9/27 下午6:51, Hao Xu 写道:
> Now we have a lot of task_work users; some of them just complete a req
> and generate a cqe. Let's put that work on a new tw list which has a
> higher priority, so that it can be handled quickly, thus reducing
> avg req latency. An explanatory case:
> 
> original timeline:
>      submit_sqe-->irq-->add completion task_work
>      -->run heavy work0~n-->run completion task_work
> new timeline:
>      submit_sqe-->irq-->add completion task_work
>      -->run completion task_work-->run heavy work0~n
> 
> One thing to watch out for is that sometimes irq completion TWs come
> in overwhelmingly, which makes the new tw list grow fast, and TWs in
> the old list are starved. So we have to limit the length of the new
> tw list. A practical value is 1/3:
>      len of new tw list < 1/3 * (len of new + old tw list)
> 
> In this way, the new tw list has a limited length and normal tasks get
> their chance to run.
> 
> Tested this patch(and the following ones) by manually replace
> __io_queue_sqe() to io_req_task_complete() to construct 'heavy' task
> works. Then test with fio:
> 
> ioengine=io_uring
> thread=1
> bs=4k
> direct=1
> rw=randread
> time_based=1
> runtime=600
> randrepeat=0
> group_reporting=1
> filename=/dev/nvme0n1
> 
> Tried various iodepths.
> The peak IOPS with this patch is 314K, while the old one is 249K.
> For avg latency, the difference shows as iodepth grows:
> depth and avg latency (usec):
> 	depth      new          old
> 	 1        22.80        23.77
> 	 2        23.48        24.54
> 	 4        24.26        25.57
> 	 8        29.21        32.89
> 	 16       53.61        63.50
> 	 32       106.29       131.34
> 	 64       217.21       256.33
> 	 128      421.59       513.87
> 	 256      815.15       1050.99
> 
> More data (95%, 99%, etc.) is in the cover letter.
> 
> Signed-off-by: Hao Xu <[email protected]>
> ---
>   fs/io_uring.c | 46 +++++++++++++++++++++++++++++++++-------------
>   1 file changed, 33 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 8317c360f7a4..582ef7f55a35 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -461,6 +461,7 @@ struct io_ring_ctx {
>   	};
>   };
>   
> +#define MAX_EMERGENCY_TW_RATIO	3
>   struct io_uring_task {
>   	/* submission side */
>   	int			cached_refs;
> @@ -475,6 +476,9 @@ struct io_uring_task {
>   	spinlock_t		task_lock;
>   	struct io_wq_work_list	task_list;
>   	struct callback_head	task_work;
> +	struct io_wq_work_list	prior_task_list;
> +	unsigned int		nr;
> +	unsigned int		prior_nr;
>   	bool			task_running;
>   };
>   
> @@ -2131,13 +2135,18 @@ static void tctx_task_work(struct callback_head *cb)
>   
>   	while (1) {
>   		struct io_wq_work_node *node;
> +		struct io_wq_work_list *merged_list;
>   
> -		if (!tctx->task_list.first && locked)
> +		if (!tctx->prior_task_list.first &&
> +		    !tctx->task_list.first && locked)
>   			io_submit_flush_completions(ctx);
>   
>   		spin_lock_irq(&tctx->task_lock);
> -		node = tctx->task_list.first;
> +		merged_list = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
> +		node = merged_list->first;
>   		INIT_WQ_LIST(&tctx->task_list);
> +		INIT_WQ_LIST(&tctx->prior_task_list);
> +		tctx->nr = tctx->prior_nr = 0;
>   		if (!node)
>   			tctx->task_running = false;
>   		spin_unlock_irq(&tctx->task_lock);
> @@ -2166,19 +2175,26 @@ static void tctx_task_work(struct callback_head *cb)
>   	ctx_flush_and_put(ctx, &locked);
>   }
>   
> -static void io_req_task_work_add(struct io_kiocb *req)
> +static void io_req_task_work_add(struct io_kiocb *req, bool emergency)
>   {
>   	struct task_struct *tsk = req->task;
>   	struct io_uring_task *tctx = tsk->io_uring;
>   	enum task_work_notify_mode notify;
>   	struct io_wq_work_node *node;
> +	struct io_wq_work_list *merged_list;
>   	unsigned long flags;
>   	bool running;
>   
>   	WARN_ON_ONCE(!tctx);
>   
>   	spin_lock_irqsave(&tctx->task_lock, flags);
> -	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
> +	if (emergency && tctx->prior_nr * MAX_EMERGENCY_TW_RATIO < tctx->nr) {
                                       This should definitely be <= to
avoid inverted completion TWs. I will update it in the next version.
> +		wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
> +		tctx->prior_nr++;
> +	} else {
> +		wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
> +	}
> +	tctx->nr++;
>   	running = tctx->task_running;
>   	if (!running)
>   		tctx->task_running = true;
> @@ -2202,9 +2218,12 @@ static void io_req_task_work_add(struct io_kiocb *req)
>   	}
>   
>   	spin_lock_irqsave(&tctx->task_lock, flags);
> +	tctx->nr = tctx->prior_nr = 0;
>   	tctx->task_running = false;
> -	node = tctx->task_list.first;
> +	merged_list = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
> +	node = merged_list->first;
>   	INIT_WQ_LIST(&tctx->task_list);
> +	INIT_WQ_LIST(&tctx->prior_task_list);
>   	spin_unlock_irqrestore(&tctx->task_lock, flags);
>   
>   	while (node) {
> @@ -2241,19 +2260,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
>   {
>   	req->result = ret;
>   	req->io_task_work.func = io_req_task_cancel;
> -	io_req_task_work_add(req);
> +	io_req_task_work_add(req, true);
>   }
>   
>   static void io_req_task_queue(struct io_kiocb *req)
>   {
>   	req->io_task_work.func = io_req_task_submit;
> -	io_req_task_work_add(req);
> +	io_req_task_work_add(req, false);
>   }
>   
>   static void io_req_task_queue_reissue(struct io_kiocb *req)
>   {
>   	req->io_task_work.func = io_queue_async_work;
> -	io_req_task_work_add(req);
> +	io_req_task_work_add(req, false);
>   }
>   
>   static inline void io_queue_next(struct io_kiocb *req)
> @@ -2373,7 +2392,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req)
>   {
>   	if (req_ref_put_and_test(req)) {
>   		req->io_task_work.func = io_free_req_work;
> -		io_req_task_work_add(req);
> +		io_req_task_work_add(req, false);
>   	}
>   }
>   
> @@ -2693,7 +2712,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
>   		return;
>   	req->result = res;
>   	req->io_task_work.func = io_req_task_complete;
> -	io_req_task_work_add(req);
> +	io_req_task_work_add(req, true);
>   }
>   
>   static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
> @@ -5256,7 +5275,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
>   	 * of executing it. We can't safely execute it anyway, as we may not
>   	 * have the needed state needed for it anyway.
>   	 */
> -	io_req_task_work_add(req);
> +	io_req_task_work_add(req, false);
>   	return 1;
>   }
>   
> @@ -5934,7 +5953,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
>   	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
>   
>   	req->io_task_work.func = io_req_task_timeout;
> -	io_req_task_work_add(req);
> +	io_req_task_work_add(req, false);
>   	return HRTIMER_NORESTART;
>   }
>   
> @@ -6916,7 +6935,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
>   	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
>   
>   	req->io_task_work.func = io_req_task_link_timeout;
> -	io_req_task_work_add(req);
> +	io_req_task_work_add(req, false);
>   	return HRTIMER_NORESTART;
>   }
>   
> @@ -8543,6 +8562,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
>   	task->io_uring = tctx;
>   	spin_lock_init(&tctx->task_lock);
>   	INIT_WQ_LIST(&tctx->task_list);
> +	INIT_WQ_LIST(&tctx->prior_task_list);
>   	init_task_work(&tctx->task_work, tctx_task_work);
>   	return 0;
>   }
> 


  reply	other threads:[~2021-09-29 12:31 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-27 10:51 [PATCH v2 0/8] task_work optimization Hao Xu
2021-09-27 10:51 ` [PATCH 1/8] io-wq: code clean for io_wq_add_work_after() Hao Xu
2021-09-27 10:51 ` [PATCH 2/8] io-wq: add helper to merge two wq_lists Hao Xu
2021-09-27 10:51 ` [PATCH 3/8] io_uring: add a limited tw list for irq completion work Hao Xu
2021-09-29 12:31   ` Hao Xu [this message]
2021-09-27 10:51 ` [PATCH 4/8] io_uring: add helper for task work execution code Hao Xu
2021-09-27 10:51 ` [PATCH 5/8] io_uring: split io_req_complete_post() and add a helper Hao Xu
2021-09-27 10:51 ` [PATCH 6/8] io_uring: move up io_put_kbuf() and io_put_rw_kbuf() Hao Xu
2021-09-27 10:51 ` [PATCH 7/8] io_uring: add tw_ctx for io_uring_task Hao Xu
2021-09-27 10:51 ` [PATCH 8/8] io_uring: batch completion in prior_task_list Hao Xu
  -- strict thread matches above, loose matches on Subject: below --
2021-09-27  6:17 [PATCH 0/6] task_work optimization Hao Xu
2021-09-27  6:17 ` [PATCH 3/8] io_uring: add a limited tw list for irq completion work Hao Xu
2021-09-28 11:29   ` Pavel Begunkov
2021-09-28 16:55     ` Hao Xu
2021-09-29 11:25       ` Pavel Begunkov
2021-09-29 11:38     ` Hao Xu
2021-09-30  9:02       ` Pavel Begunkov
2021-09-30  3:21     ` Hao Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6c6f1f1c-e1f5-7144-f3d1-c368ecbfc531@linux.alibaba.com \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox