public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path
@ 2021-08-04 14:43 Jens Axboe
  2021-08-04 15:33 ` Daniel Wagner
  2021-08-05  9:17 ` Peter Zijlstra
  0 siblings, 2 replies; 4+ messages in thread
From: Jens Axboe @ 2021-08-04 14:43 UTC (permalink / raw)
  To: io-uring, LKML; +Cc: Daniel Wagner, Peter Zijlstra

Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
stress-ng:

| [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
| [   90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [   90.202561] Call Trace:
| [   90.202577]  dump_stack_lvl+0x34/0x44
| [   90.202584]  ___might_sleep.cold+0x87/0x94
| [   90.202588]  rt_spin_lock+0x19/0x70
| [   90.202593]  ___slab_alloc+0xcb/0x7d0
| [   90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
| [   90.202603]  ? dequeue_entity+0xc3/0x290
| [   90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202610]  ? pick_next_task_fair+0xb9/0x330
| [   90.202612]  ? __schedule+0x670/0x1410
| [   90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
| [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202625]  io_wq_worker_sleeping+0x37/0x50
| [   90.202628]  schedule+0x30/0xd0
| [   90.202630]  schedule_timeout+0x8f/0x1a0
| [   90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
| [   90.202637]  io_wqe_worker+0xfd/0x320
| [   90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
| [   90.202644]  ? io_worker_handle_work+0x670/0x670
| [   90.202646]  ? io_worker_handle_work+0x670/0x670
| [   90.202649]  ret_from_fork+0x22/0x30

which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
a raw spinlock. Besides that not working on RT, doing any kind of
allocation from inside schedule() is kind of nasty and should be avoided
if at all possible.

This particular path happens when an io-wq worker goes to sleep, and we
need a new worker to handle pending work. We currently allocate a small
data item to hold the information we need to create a new worker, but we
can instead include this data in the io_worker struct itself and just
protect it with a single bit lock. We only really need one per worker
anyway, as we will have run pending work between to sleep cycles.

https://lore.kernel.org/lkml/[email protected]/
Reported-by: Daniel Wagner <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>

---

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 50dc93ffc153..d5f56d41d59b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {
 
 	struct completion ref_done;
 
+	unsigned long create_state;
+	struct callback_head create_work;
+	int create_index;
+
 	struct rcu_head rcu;
 };
 
@@ -261,42 +265,46 @@ static void io_wqe_inc_running(struct io_worker *worker)
 	atomic_inc(&acct->nr_running);
 }
 
-struct create_worker_data {
-	struct callback_head work;
-	struct io_wqe *wqe;
-	int index;
-};
-
 static void create_worker_cb(struct callback_head *cb)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 	struct io_wq *wq;
 
-	cwd = container_of(cb, struct create_worker_data, work);
-	wq = cwd->wqe->wq;
-	create_io_worker(wq, cwd->wqe, cwd->index);
-	kfree(cwd);
+	worker = container_of(cb, struct io_worker, create_work);
+	wq = worker->wqe->wq;
+	create_io_worker(wq, worker->wqe, worker->create_index);
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 }
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+				   struct io_wqe_acct *acct)
 {
-	struct create_worker_data *cwd;
 	struct io_wq *wq = wqe->wq;
 
 	/* raced with exit, just ignore create call */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		goto fail;
+	if (!io_worker_get(worker))
+		goto fail;
+	/*
+	 * create_state manages ownership of create_work/index. We should
+	 * only need one entry per worker, as the worker going to sleep
+	 * will trigger the condition, and waking will clear it once it
+	 * runs the task_work.
+	 */
+	if (test_bit(0, &worker->create_state) ||
+	    test_and_set_bit_lock(0, &worker->create_state))
+		goto fail_release;
 
-	cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
-	if (cwd) {
-		init_task_work(&cwd->work, create_worker_cb);
-		cwd->wqe = wqe;
-		cwd->index = acct->index;
-		if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
-			return;
+	init_task_work(&worker->create_work, create_worker_cb);
+	worker->create_index = acct->index;
+	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+		return;
 
-		kfree(cwd);
-	}
+	clear_bit_unlock(0, &worker->create_state);
+fail_release:
+	io_worker_release(worker);
 fail:
 	atomic_dec(&acct->nr_running);
 	io_worker_ref_put(wq);
@@ -314,7 +322,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
 	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		io_queue_worker_create(wqe, acct);
+		io_queue_worker_create(wqe, worker, acct);
 	}
 }
 
@@ -973,12 +981,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 static bool io_task_work_match(struct callback_head *cb, void *data)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 
 	if (cb->func != create_worker_cb)
 		return false;
-	cwd = container_of(cb, struct create_worker_data, work);
-	return cwd->wqe->wq == data;
+	worker = container_of(cb, struct io_worker, create_work);
+	return worker->wqe->wq == data;
 }
 
 void io_wq_exit_start(struct io_wq *wq)
@@ -995,12 +1003,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
 		return;
 
 	while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
-		struct create_worker_data *cwd;
+		struct io_worker *worker;
 
-		cwd = container_of(cb, struct create_worker_data, work);
-		atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+		worker = container_of(cb, struct io_worker, create_work);
+		atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
 		io_worker_ref_put(wq);
-		kfree(cwd);
+		clear_bit_unlock(0, &worker->create_state);
+		io_worker_release(worker);
 	}
 
 	rcu_read_lock();

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path
  2021-08-04 14:43 [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path Jens Axboe
@ 2021-08-04 15:33 ` Daniel Wagner
  2021-08-04 15:39   ` Jens Axboe
  2021-08-05  9:17 ` Peter Zijlstra
  1 sibling, 1 reply; 4+ messages in thread
From: Daniel Wagner @ 2021-08-04 15:33 UTC (permalink / raw)
  To: Jens Axboe
  Cc: io-uring, LKML, Peter Zijlstra, linux-rt-users,
	Sebastian Andrzej Siewior

On Wed, Aug 04, 2021 at 08:43:43AM -0600, Jens Axboe wrote:
> Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
> stress-ng:
> 
> | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
> | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
> | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
> | [   90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
> | [   90.202561] Call Trace:
> | [   90.202577]  dump_stack_lvl+0x34/0x44
> | [   90.202584]  ___might_sleep.cold+0x87/0x94
> | [   90.202588]  rt_spin_lock+0x19/0x70
> | [   90.202593]  ___slab_alloc+0xcb/0x7d0
> | [   90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
> | [   90.202603]  ? dequeue_entity+0xc3/0x290
> | [   90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
> | [   90.202610]  ? pick_next_task_fair+0xb9/0x330
> | [   90.202612]  ? __schedule+0x670/0x1410
> | [   90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
> | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
> | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
> | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
> | [   90.202628]  schedule+0x30/0xd0
> | [   90.202630]  schedule_timeout+0x8f/0x1a0
> | [   90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
> | [   90.202637]  io_wqe_worker+0xfd/0x320
> | [   90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
> | [   90.202644]  ? io_worker_handle_work+0x670/0x670
> | [   90.202646]  ? io_worker_handle_work+0x670/0x670
> | [   90.202649]  ret_from_fork+0x22/0x30
> 
> which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
> a raw spinlock. Besides that not working on RT, doing any kind of
> allocation from inside schedule() is kind of nasty and should be avoided
> if at all possible.
> 
> This particular path happens when an io-wq worker goes to sleep, and we
> need a new worker to handle pending work. We currently allocate a small
> data item to hold the information we need to create a new worker, but we
> can instead include this data in the io_worker struct itself and just
> protect it with a single bit lock. We only really need one per worker
> anyway, as we will have run pending work between to sleep cycles.
> 
> https://lore.kernel.org/lkml/[email protected]/
> Reported-by: Daniel Wagner <[email protected]>
> Signed-off-by: Jens Axboe <[email protected]>

I applied this patch on top of v5.14-rc4-rt4 and with it all looks
good.

Tested-by: Daniel Wagner <[email protected]>


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path
  2021-08-04 15:33 ` Daniel Wagner
@ 2021-08-04 15:39   ` Jens Axboe
  0 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2021-08-04 15:39 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: io-uring, LKML, Peter Zijlstra, linux-rt-users,
	Sebastian Andrzej Siewior

On 8/4/21 9:33 AM, Daniel Wagner wrote:
> On Wed, Aug 04, 2021 at 08:43:43AM -0600, Jens Axboe wrote:
>> Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
>> stress-ng:
>>
>> | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
>> | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
>> | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
>> | [   90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
>> | [   90.202561] Call Trace:
>> | [   90.202577]  dump_stack_lvl+0x34/0x44
>> | [   90.202584]  ___might_sleep.cold+0x87/0x94
>> | [   90.202588]  rt_spin_lock+0x19/0x70
>> | [   90.202593]  ___slab_alloc+0xcb/0x7d0
>> | [   90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
>> | [   90.202603]  ? dequeue_entity+0xc3/0x290
>> | [   90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
>> | [   90.202610]  ? pick_next_task_fair+0xb9/0x330
>> | [   90.202612]  ? __schedule+0x670/0x1410
>> | [   90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
>> | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
>> | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
>> | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
>> | [   90.202628]  schedule+0x30/0xd0
>> | [   90.202630]  schedule_timeout+0x8f/0x1a0
>> | [   90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
>> | [   90.202637]  io_wqe_worker+0xfd/0x320
>> | [   90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
>> | [   90.202644]  ? io_worker_handle_work+0x670/0x670
>> | [   90.202646]  ? io_worker_handle_work+0x670/0x670
>> | [   90.202649]  ret_from_fork+0x22/0x30
>>
>> which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
>> a raw spinlock. Besides that not working on RT, doing any kind of
>> allocation from inside schedule() is kind of nasty and should be avoided
>> if at all possible.
>>
>> This particular path happens when an io-wq worker goes to sleep, and we
>> need a new worker to handle pending work. We currently allocate a small
>> data item to hold the information we need to create a new worker, but we
>> can instead include this data in the io_worker struct itself and just
>> protect it with a single bit lock. We only really need one per worker
>> anyway, as we will have run pending work between to sleep cycles.
>>
>> https://lore.kernel.org/lkml/[email protected]/
>> Reported-by: Daniel Wagner <[email protected]>
>> Signed-off-by: Jens Axboe <[email protected]>
> 
> I applied this patch on top of v5.14-rc4-rt4 and with it all looks
> good.
> 
> Tested-by: Daniel Wagner <[email protected]>

Great, thanks for testing!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path
  2021-08-04 14:43 [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path Jens Axboe
  2021-08-04 15:33 ` Daniel Wagner
@ 2021-08-05  9:17 ` Peter Zijlstra
  1 sibling, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2021-08-05  9:17 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, LKML, Daniel Wagner

On Wed, Aug 04, 2021 at 08:43:43AM -0600, Jens Axboe wrote:
> Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
> stress-ng:
> 
> | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
> | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
> | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
> | [   90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
> | [   90.202561] Call Trace:
> | [   90.202577]  dump_stack_lvl+0x34/0x44
> | [   90.202584]  ___might_sleep.cold+0x87/0x94
> | [   90.202588]  rt_spin_lock+0x19/0x70
> | [   90.202593]  ___slab_alloc+0xcb/0x7d0
> | [   90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
> | [   90.202603]  ? dequeue_entity+0xc3/0x290
> | [   90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
> | [   90.202610]  ? pick_next_task_fair+0xb9/0x330
> | [   90.202612]  ? __schedule+0x670/0x1410
> | [   90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
> | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
> | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
> | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
> | [   90.202628]  schedule+0x30/0xd0
> | [   90.202630]  schedule_timeout+0x8f/0x1a0
> | [   90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
> | [   90.202637]  io_wqe_worker+0xfd/0x320
> | [   90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
> | [   90.202644]  ? io_worker_handle_work+0x670/0x670
> | [   90.202646]  ? io_worker_handle_work+0x670/0x670
> | [   90.202649]  ret_from_fork+0x22/0x30
> 
> which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
> a raw spinlock. Besides that not working on RT, doing any kind of
> allocation from inside schedule() is kind of nasty and should be avoided
> if at all possible.
> 
> This particular path happens when an io-wq worker goes to sleep, and we
> need a new worker to handle pending work. We currently allocate a small
> data item to hold the information we need to create a new worker, but we
> can instead include this data in the io_worker struct itself and just
> protect it with a single bit lock. We only really need one per worker
> anyway, as we will have run pending work between to sleep cycles.
> 
> https://lore.kernel.org/lkml/[email protected]/
> Reported-by: Daniel Wagner <[email protected]>
> Signed-off-by: Jens Axboe <[email protected]>

Thanks!

Acked-by: Peter Zijlstra (Intel) <[email protected]>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-08-05  9:17 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-04 14:43 [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path Jens Axboe
2021-08-04 15:33 ` Daniel Wagner
2021-08-04 15:39   ` Jens Axboe
2021-08-05  9:17 ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox