From: Jens Axboe <[email protected]>
To: Nadav Amit <[email protected]>
Cc: [email protected], Hao Xu <[email protected]>
Subject: Re: Race between io_wqe_worker() and io_wqe_wake_worker()
Date: Tue, 3 Aug 2021 13:53:48 -0600
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>

On 8/3/21 1:24 PM, Jens Axboe wrote:
> On 8/3/21 1:20 PM, Nadav Amit wrote:
>>
>>
>>> On Aug 3, 2021, at 11:14 AM, Jens Axboe <[email protected]> wrote:
>>>
>>> On 8/3/21 12:04 PM, Nadav Amit wrote:
>>>>
>>>>
>>>> Thanks for the quick response.
>>>>
>>>> I tried your version. It works better, but my workload still gets stuck
>>>> occasionally (less frequently, though). It is pretty obvious that the
>>>> version you sent still has a race, so I didn’t put the effort into
>>>> debugging it.
>>>
>>> All good, thanks for testing! Is it a test case you can share? Would
>>> help with confidence in the final solution.
>>
>> Unfortunately no, since it is an entire WIP project that I am working
>> on (with undetermined license at this point). But I will be happy to
>> test any solution that you provide.
> 
> OK, no worries, I'll see if I can tighten this up. I don't particularly
> hate your solution; it would just be nice to avoid creating a new worker
> if we can just keep running the current one.
> 
> I'll toss something your way in a bit...

How about this? I think the problem largely stems from the fact that we
only do a partial running decrement on exit. I left the previous checks
in place as well, since they reduce the number of times that we need to
hit that case.
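
To recap the window as I understand it (schematic sketch of the
interleaving, not taken from the patch itself): a worker times out at
the same moment that new work is queued, the waker finds it on the
free_list and wakes it instead of creating a new worker, but the worker
exits anyway and the old exit path never re-checked the work list:

  io_wqe_worker()                       io_wqe_enqueue()
  ---------------                       ----------------
  schedule_timeout() expires
                                        queues work
                                        io_wqe_wake_worker()
                                          worker still on free_list,
                                          so wake_up_process() it and
                                          don't create a new worker
  timed out, not the fixed worker,
  breaks out of the loop
  io_worker_exit()
    bare atomic_dec(&acct->nr_running),
    no re-check of the work list
                                        queued work is stranded until
                                        something else wakes a worker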


diff --git a/fs/io-wq.c b/fs/io-wq.c
index cf086b01c6c6..f072995d382b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -35,12 +35,17 @@ enum {
 	IO_WQE_FLAG_STALLED	= 1,	/* stalled on hash */
 };
 
+enum {
+	IO_WORKER_EXITING	= 0,	/* worker is exiting */
+};
+
 /*
  * One for each thread in a wqe pool
  */
 struct io_worker {
 	refcount_t ref;
 	unsigned flags;
+	unsigned long state;
 	struct hlist_nulls_node nulls_node;
 	struct list_head all_list;
 	struct task_struct *task;
@@ -130,6 +135,7 @@ struct io_cb_cancel_data {
 };
 
 static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
+static void io_wqe_dec_running(struct io_worker *worker);
 
 static bool io_worker_get(struct io_worker *worker)
 {
@@ -168,26 +174,21 @@ static void io_worker_exit(struct io_worker *worker)
 {
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-	unsigned flags;
 
 	if (refcount_dec_and_test(&worker->ref))
 		complete(&worker->ref_done);
 	wait_for_completion(&worker->ref_done);
 
-	preempt_disable();
-	current->flags &= ~PF_IO_WORKER;
-	flags = worker->flags;
-	worker->flags = 0;
-	if (flags & IO_WORKER_F_RUNNING)
-		atomic_dec(&acct->nr_running);
-	worker->flags = 0;
-	preempt_enable();
-
 	raw_spin_lock_irq(&wqe->lock);
-	if (flags & IO_WORKER_F_FREE)
+	if (worker->flags & IO_WORKER_F_FREE)
 		hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
 	acct->nr_workers--;
+	preempt_disable();
+	io_wqe_dec_running(worker);
+	worker->flags = 0;
+	current->flags &= ~PF_IO_WORKER;
+	preempt_enable();
 	raw_spin_unlock_irq(&wqe->lock);
 
 	kfree_rcu(worker, rcu);
@@ -214,15 +215,20 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 	struct hlist_nulls_node *n;
 	struct io_worker *worker;
 
-	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
-	if (is_a_nulls(n))
-		return false;
-
-	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
-	if (io_worker_get(worker)) {
-		wake_up_process(worker->task);
+	/*
+	 * Iterate free_list and see if we can find an idle worker to
+	 * activate. If a given worker is on the free_list but in the process
+	 * of exiting, keep trying.
+	 */
+	hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
+		if (!io_worker_get(worker))
+			continue;
+		if (!test_bit(IO_WORKER_EXITING, &worker->state)) {
+			wake_up_process(worker->task);
+			io_worker_release(worker);
+			return true;
+		}
 		io_worker_release(worker);
-		return true;
 	}
 
 	return false;
@@ -560,8 +566,17 @@ static int io_wqe_worker(void *data)
 		if (ret)
 			continue;
 		/* timed out, exit unless we're the fixed worker */
-		if (!(worker->flags & IO_WORKER_F_FIXED))
+		if (!(worker->flags & IO_WORKER_F_FIXED)) {
+			/*
+			 * Someone elevated our refs, which could be trying
+			 * to re-activate for work. Loop one more time for
+			 * that case.
+			 */
+			if (refcount_read(&worker->ref) != 1)
+				continue;
+			set_bit(IO_WORKER_EXITING, &worker->state);
 			break;
+		}
 	}
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {

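For reference, io_wqe_dec_running() (roughly as it reads in the tree
this patch is against; quoted from memory, so treat it as a sketch) is
what makes the full decrement self-correcting: if the last running
worker drops the count while work is still queued, it wakes or creates
a replacement before going away, which the bare atomic_dec() in the old
exit path never did.

static void io_wqe_dec_running(struct io_worker *worker)
	__must_hold(wqe->lock)
{
	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
	struct io_wqe *wqe = worker->wqe;

	if (!(worker->flags & IO_WORKER_F_RUNNING))
		return;

	/*
	 * Last running worker and the queue is non-empty: wake a free
	 * worker or create a new one, so exiting can't strand work.
	 */
	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
		io_wqe_wake_worker(wqe, acct);
}
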
-- 
Jens Axboe



Thread overview: 11+ messages
2021-08-03  1:05 Race between io_wqe_worker() and io_wqe_wake_worker() Nadav Amit
2021-08-03 13:22 ` Jens Axboe
2021-08-03 14:37   ` Jens Axboe
2021-08-03 17:25     ` Hao Xu
2021-08-03 18:04     ` Nadav Amit
2021-08-03 18:14       ` Jens Axboe
2021-08-03 19:20         ` Nadav Amit
2021-08-03 19:24           ` Jens Axboe
2021-08-03 19:53             ` Jens Axboe [this message]
2021-08-03 21:16               ` Nadav Amit
2021-08-03 21:25                 ` Jens Axboe
