From: Jens Axboe <[email protected]>
To: Nadav Amit <[email protected]>
Cc: [email protected], Hao Xu <[email protected]>
Subject: Re: Race between io_wqe_worker() and io_wqe_wake_worker()
Date: Tue, 3 Aug 2021 13:53:48 -0600 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
On 8/3/21 1:24 PM, Jens Axboe wrote:
> On 8/3/21 1:20 PM, Nadav Amit wrote:
>>
>>
>>> On Aug 3, 2021, at 11:14 AM, Jens Axboe <[email protected]> wrote:
>>>
>>> On 8/3/21 12:04 PM, Nadav Amit wrote:
>>>>
>>>>
>>>> Thanks for the quick response.
>>>>
>>>> I tried your version. It works better, but my workload still gets stuck
>>>> occasionally (less frequently though). It is pretty obvious that the
>>>> version you sent still has a race, so I didn’t put the effort into
>>>> debugging it.
>>>
>>> All good, thanks for testing! Is it a test case you can share? Would
>>> help with confidence in the final solution.
>>
>> Unfortunately no, since it is an entire WIP project that I am working
>> on (with undetermined license at this point). But I will be happy to
>> test any solution that you provide.
>
> OK no worries, I'll see if I can tighten this up. I don't particularly
> hate your solution, it would just be nice to avoid creating a new worker
> if we can just keep running the current one.
>
> I'll toss something your way in a bit...
How about this? I think this largely stems from the fact that we only
do a partial running decrement on exit. Left the previous checks in
place as well, as they will reduce the number of times that we need
to hit that case.
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cf086b01c6c6..f072995d382b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -35,12 +35,17 @@ enum {
IO_WQE_FLAG_STALLED = 1, /* stalled on hash */
};
+enum {
+ IO_WORKER_EXITING = 0, /* worker is exiting */
+};
+
/*
* One for each thread in a wqe pool
*/
struct io_worker {
refcount_t ref;
unsigned flags;
+ unsigned long state;
struct hlist_nulls_node nulls_node;
struct list_head all_list;
struct task_struct *task;
@@ -130,6 +135,7 @@ struct io_cb_cancel_data {
};
static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
+static void io_wqe_dec_running(struct io_worker *worker);
static bool io_worker_get(struct io_worker *worker)
{
@@ -168,26 +174,21 @@ static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- unsigned flags;
if (refcount_dec_and_test(&worker->ref))
complete(&worker->ref_done);
wait_for_completion(&worker->ref_done);
- preempt_disable();
- current->flags &= ~PF_IO_WORKER;
- flags = worker->flags;
- worker->flags = 0;
- if (flags & IO_WORKER_F_RUNNING)
- atomic_dec(&acct->nr_running);
- worker->flags = 0;
- preempt_enable();
-
raw_spin_lock_irq(&wqe->lock);
- if (flags & IO_WORKER_F_FREE)
+ if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
acct->nr_workers--;
+ preempt_disable();
+ io_wqe_dec_running(worker);
+ worker->flags = 0;
+ current->flags &= ~PF_IO_WORKER;
+ preempt_enable();
raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
@@ -214,15 +215,20 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
struct hlist_nulls_node *n;
struct io_worker *worker;
- n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
- if (is_a_nulls(n))
- return false;
-
- worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
- if (io_worker_get(worker)) {
- wake_up_process(worker->task);
+ /*
+ * Iterate free_list and see if we can find an idle worker to
+ * activate. If a given worker is on the free_list but in the process
+ * of exiting, keep trying.
+ */
+ hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
+ if (!io_worker_get(worker))
+ continue;
+ if (!test_bit(IO_WORKER_EXITING, &worker->state)) {
+ wake_up_process(worker->task);
+ io_worker_release(worker);
+ return true;
+ }
io_worker_release(worker);
- return true;
}
return false;
@@ -560,8 +566,17 @@ static int io_wqe_worker(void *data)
if (ret)
continue;
/* timed out, exit unless we're the fixed worker */
- if (!(worker->flags & IO_WORKER_F_FIXED))
+ if (!(worker->flags & IO_WORKER_F_FIXED)) {
+ /*
+ * Someone elevated our refs, which could be trying
+ * to re-activate for work. Loop one more time for
+ * that case.
+ */
+ if (refcount_read(&worker->ref) != 1)
+ continue;
+ set_bit(IO_WORKER_EXITING, &worker->state);
break;
+ }
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
--
Jens Axboe
next prev parent reply other threads:[~2021-08-03 19:53 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-08-03 1:05 Race between io_wqe_worker() and io_wqe_wake_worker() Nadav Amit
2021-08-03 13:22 ` Jens Axboe
2021-08-03 14:37 ` Jens Axboe
2021-08-03 17:25 ` Hao Xu
2021-08-03 18:04 ` Nadav Amit
2021-08-03 18:14 ` Jens Axboe
2021-08-03 19:20 ` Nadav Amit
2021-08-03 19:24 ` Jens Axboe
2021-08-03 19:53 ` Jens Axboe [this message]
2021-08-03 21:16 ` Nadav Amit
2021-08-03 21:25 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox