From: Jens Axboe <[email protected]>
To: [email protected]
Cc: Pavel Begunkov <[email protected]>, Jens Axboe <[email protected]>
Subject: [PATCH 08/18] io_uring: fix REQ_F_COMP_LOCKED by killing it
Date: Fri, 16 Oct 2020 10:02:14 -0600
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>
From: Pavel Begunkov <[email protected]>

REQ_F_COMP_LOCKED is used and implemented in a buggy way: the flag is
set before io_put_req() but never cleared afterwards, so if that put
wasn't the final reference, the request will later be freed, with the
flag still set, from some other context that may not hold a spinlock.
That opens races with removing linked timeouts and unsynchronised
completion (e.g. access to the CQ).

Instead of fixing REQ_F_COMP_LOCKED, kill the flag entirely and use
task_work_add() to move such requests to a fresh context and free them
from there, as was done with __io_free_req_finish().

Signed-off-by: Pavel Begunkov <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
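A minimal sketch of the buggy pattern, with invented names rather than
the real io_uring types, for anyone not steeped in the refcounting
here: the flag rides on the request across a put, so whichever context
happens to drop the last reference inherits it, whether or not that
context actually holds the lock.

#include <stdatomic.h>

struct req {
	atomic_int refs;
	unsigned int flags;
};

#define F_COMP_LOCKED	(1U << 0)	/* "completion_lock is held" */

static void free_req(struct req *req)
{
	/*
	 * Trusts the flag: skips locking when it is set. But the final
	 * put may run in a context that never took the lock, with the
	 * flag still stuck on from an earlier caller.
	 */
	if (!(req->flags & F_COMP_LOCKED)) {
		/* take completion_lock */
	}
	/* tear down and free */
}

static void put_req(struct req *req)
{
	if (atomic_fetch_sub(&req->refs, 1) == 1)
		free_req(req);
}

static void complete_under_lock(struct req *req)
{
	req->flags |= F_COMP_LOCKED;	/* set before the put ... */
	put_req(req);			/* ... never cleared if not final */
}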
fs/io_uring.c | 149 +++++++++++++++++++++++---------------------------
1 file changed, 69 insertions(+), 80 deletions(-)
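And a runnable userspace sketch of the replacement pattern, again with
made-up names: the putter only drops references, and if that was the
last one, the actual free is bounced to a context where locking is
always safe. The patch uses task_work with an io-wq fallback; a plain
thread stands in for that below.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct req {
	atomic_int refs;
};

static void *deferred_free(void *arg)
{
	/* Safe context: no spinlock held, may sleep or take locks. */
	free(arg);
	return NULL;
}

/* Drop @refs references; if that was the last one, free later. */
static void put_req_deferred(struct req *req, int refs)
{
	if (atomic_fetch_sub(&req->refs, refs) == refs) {
		pthread_t t;	/* stand-in for task_work_add() */
		pthread_create(&t, NULL, deferred_free, req);
		pthread_detach(t);
	}
}

int main(void)
{
	struct req *req = malloc(sizeof(*req));
	atomic_init(&req->refs, 2);
	put_req_deferred(req, 2);	/* e.g. submit + completion refs */
	pthread_exit(NULL);		/* let the worker finish first */
}

refcount_sub_and_test() in the kernel answers the same "did we just
hit zero" question in one step, which is what io_put_req_deferred()
below relies on.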
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b76aecb3443d..19deb768ad07 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -574,7 +574,6 @@ enum {
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
- REQ_F_COMP_LOCKED_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
@@ -613,8 +612,6 @@ enum {
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* completion under lock */
- REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
@@ -963,8 +960,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
+static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
-static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
@@ -1316,9 +1313,8 @@ static void io_kill_timeout(struct io_kiocb *req)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
}
}
@@ -1369,8 +1365,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
if (link) {
__io_queue_linked_timeout(link);
/* drop submission reference */
- link->flags |= REQ_F_COMP_LOCKED;
- io_put_req(link);
+ io_put_req_deferred(link, 1);
}
kfree(de);
} while (!list_empty(&ctx->defer_list));
@@ -1597,13 +1592,19 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
list_del(&req->compl.list);
__io_cqring_fill_event(req, req->result, req->compl.cflags);
- if (!(req->flags & REQ_F_LINK_HEAD)) {
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req(req);
- } else {
+
+ /*
+ * io_free_req() doesn't care about completion_lock unless one
+ * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
+ * because of a potential deadlock with req->work.fs->lock
+ */
+ if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
+ |REQ_F_WORK_INITIALIZED)) {
spin_unlock_irq(&ctx->completion_lock);
io_put_req(req);
spin_lock_irq(&ctx->completion_lock);
+ } else {
+ io_put_req(req);
}
}
io_commit_cqring(ctx);
@@ -1702,10 +1703,14 @@ static void io_dismantle_req(struct io_kiocb *req)
io_req_clean_work(req);
}
-static void __io_free_req_finish(struct io_kiocb *req)
+static void __io_free_req(struct io_kiocb *req)
{
- struct io_uring_task *tctx = req->task->io_uring;
- struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_task *tctx;
+ struct io_ring_ctx *ctx;
+
+ io_dismantle_req(req);
+ tctx = req->task->io_uring;
+ ctx = req->ctx;
atomic_long_inc(&tctx->req_complete);
if (tctx->in_idle)
@@ -1719,33 +1724,6 @@ static void __io_free_req_finish(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
-static void io_req_task_file_table_put(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
-
- io_dismantle_req(req);
- __io_free_req_finish(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_COMP_LOCKED)) {
- io_dismantle_req(req);
- __io_free_req_finish(req);
- } else {
- int ret;
-
- init_task_work(&req->task_work, io_req_task_file_table_put);
- ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
- if (unlikely(ret)) {
- struct task_struct *tsk;
-
- tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, 0);
- }
- }
-}
-
static bool io_link_cancel_timeout(struct io_kiocb *req)
{
struct io_timeout_data *io = req->async_data;
@@ -1754,11 +1732,10 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
- req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
req->flags &= ~REQ_F_LINK_HEAD;
- io_put_req(req);
+ io_put_req_deferred(req, 1);
return true;
}
@@ -1785,17 +1762,12 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req)
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
bool wake_ev;
- if (!(req->flags & REQ_F_COMP_LOCKED)) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- wake_ev = __io_kill_linked_timeout(req);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else {
- wake_ev = __io_kill_linked_timeout(req);
- }
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ wake_ev = __io_kill_linked_timeout(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (wake_ev)
io_cqring_ev_posted(ctx);
@@ -1835,27 +1807,29 @@ static void __io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
- link->flags |= REQ_F_COMP_LOCKED;
- __io_double_put_req(link);
+
+ /*
+ * It's ok to free under spinlock as they're not linked anymore,
+ * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
+ * work.fs->lock.
+ */
+ if (link->flags & REQ_F_WORK_INITIALIZED)
+ io_put_req_deferred(link, 2);
+ else
+ io_double_put_req(link);
}
io_commit_cqring(ctx);
- io_cqring_ev_posted(ctx);
}
static void io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
- if (!(req->flags & REQ_F_COMP_LOCKED)) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- __io_fail_links(req);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else {
- __io_fail_links(req);
- }
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ __io_fail_links(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
@@ -2069,6 +2043,34 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
+static void io_put_req_deferred_cb(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+ io_free_req(req);
+}
+
+static void io_free_req_deferred(struct io_kiocb *req)
+{
+ int ret;
+
+ init_task_work(&req->task_work, io_put_req_deferred_cb);
+ ret = io_req_task_work_add(req, true);
+ if (unlikely(ret)) {
+ struct task_struct *tsk;
+
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
+ }
+}
+
+static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
+{
+ if (refcount_sub_and_test(refs, &req->refs))
+ io_free_req_deferred(req);
+}
+
static struct io_wq_work *io_steal_work(struct io_kiocb *req)
{
struct io_kiocb *nxt;
@@ -2085,17 +2087,6 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req)
return nxt ? &nxt->work : NULL;
}
-/*
- * Must only be used if we don't need to care about links, usually from
- * within the completion handling itself.
- */
-static void __io_double_put_req(struct io_kiocb *req)
-{
- /* drop both submit and complete references */
- if (refcount_sub_and_test(2, &req->refs))
- __io_free_req(req);
-}
-
static void io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
@@ -5120,9 +5111,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
- req->flags |= REQ_F_COMP_LOCKED;
req_set_fail_links(req);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
}
return do_complete;
@@ -5304,9 +5294,8 @@ static int __io_timeout_cancel(struct io_kiocb *req)
list_del_init(&req->timeout.list);
req_set_fail_links(req);
- req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, -ECANCELED);
- io_put_req(req);
+ io_put_req_deferred(req, 1);
return 0;
}
--
2.28.0