* [PATCH v3 1/6] io_uring: make del_task_file more forgiving
2021-03-05 12:58 [PATCH v3 5.12 0/6] remove task file notes Pavel Begunkov
@ 2021-03-05 12:58 ` Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 2/6] io_uring: introduce ctx to tctx back map Pavel Begunkov
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-03-05 12:58 UTC (permalink / raw)
To: Jens Axboe, io-uring
Rework io_uring_del_task_file(), so it accepts an index to delete, and
it's not necessarily have to be in the ->xa. Infer file from xa_erase()
to maintain a single origin of truth.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 99e37f9688bf..bcf2c08fc12e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8779,15 +8779,18 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_task_file(struct file *file)
+static void io_uring_del_task_file(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
+ struct file *file;
+
+ file = xa_erase(&tctx->xa, index);
+ if (!file)
+ return;
if (tctx->last == file)
tctx->last = NULL;
- file = xa_erase(&tctx->xa, (unsigned long)file);
- if (file)
- fput(file);
+ fput(file);
}
static void io_uring_clean_tctx(struct io_uring_task *tctx)
@@ -8796,7 +8799,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
unsigned long index;
xa_for_each(&tctx->xa, index, file)
- io_uring_del_task_file(file);
+ io_uring_del_task_file(index);
if (tctx->io_wq) {
io_wq_put_and_exit(tctx->io_wq);
tctx->io_wq = NULL;
--
2.24.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v3 2/6] io_uring: introduce ctx to tctx back map
2021-03-05 12:58 [PATCH v3 5.12 0/6] remove task file notes Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 1/6] io_uring: make del_task_file more forgiving Pavel Begunkov
@ 2021-03-05 12:58 ` Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 3/6] io_uring: do ctx initiated file note removal Pavel Begunkov
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-03-05 12:58 UTC (permalink / raw)
To: Jens Axboe, io-uring
For each pair tcxt-ctx create an object and chain it into ctx, so we
have a way to traverse all tctx that are using current ctx. Preparation
patch, will be used later.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 44 insertions(+), 14 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bcf2c08fc12e..e62f28512cc4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -454,6 +454,7 @@ struct io_ring_ctx {
/* Keep this last, we don't need it for the fast path */
struct work_struct exit_work;
+ struct list_head tctx_list;
};
/*
@@ -805,6 +806,13 @@ struct io_kiocb {
struct io_wq_work work;
};
+struct io_tctx_node {
+ struct list_head ctx_node;
+ struct task_struct *task;
+ struct file *file;
+ struct io_ring_ctx *ctx;
+};
+
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
@@ -1144,6 +1152,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
+ INIT_LIST_HEAD(&ctx->tctx_list);
INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
return ctx;
@@ -8742,6 +8751,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
{
struct io_uring_task *tctx = current->io_uring;
+ struct io_tctx_node *node;
int ret;
if (unlikely(!tctx)) {
@@ -8754,13 +8764,25 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
void *old = xa_load(&tctx->xa, (unsigned long)file);
if (!old) {
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+ node->ctx = ctx;
+ node->file = file;
+ node->task = current;
+
get_file(file);
ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
- file, GFP_KERNEL));
+ node, GFP_KERNEL));
if (ret) {
fput(file);
+ kfree(node);
return ret;
}
+
+ mutex_lock(&ctx->uring_lock);
+ list_add(&node->ctx_node, &ctx->tctx_list);
+ mutex_unlock(&ctx->uring_lock);
}
tctx->last = file;
}
@@ -8782,23 +8804,31 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
static void io_uring_del_task_file(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
- struct file *file;
+ struct io_tctx_node *node;
- file = xa_erase(&tctx->xa, index);
- if (!file)
+ node = xa_erase(&tctx->xa, index);
+ if (!node)
return;
- if (tctx->last == file)
+ WARN_ON_ONCE(current != node->task);
+ WARN_ON_ONCE(list_empty(&node->ctx_node));
+
+ mutex_lock(&node->ctx->uring_lock);
+ list_del(&node->ctx_node);
+ mutex_unlock(&node->ctx->uring_lock);
+
+ if (tctx->last == node->file)
tctx->last = NULL;
- fput(file);
+ fput(node->file);
+ kfree(node);
}
static void io_uring_clean_tctx(struct io_uring_task *tctx)
{
- struct file *file;
+ struct io_tctx_node *node;
unsigned long index;
- xa_for_each(&tctx->xa, index, file)
+ xa_for_each(&tctx->xa, index, node)
io_uring_del_task_file(index);
if (tctx->io_wq) {
io_wq_put_and_exit(tctx->io_wq);
@@ -8809,13 +8839,13 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
void __io_uring_files_cancel(struct files_struct *files)
{
struct io_uring_task *tctx = current->io_uring;
- struct file *file;
+ struct io_tctx_node *node;
unsigned long index;
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
- xa_for_each(&tctx->xa, index, file)
- io_uring_cancel_task_requests(file->private_data, files);
+ xa_for_each(&tctx->xa, index, node)
+ io_uring_cancel_task_requests(node->ctx, files);
atomic_dec(&tctx->in_idle);
if (files)
@@ -8881,11 +8911,11 @@ void __io_uring_task_cancel(void)
atomic_inc(&tctx->in_idle);
if (tctx->sqpoll) {
- struct file *file;
+ struct io_tctx_node *node;
unsigned long index;
- xa_for_each(&tctx->xa, index, file)
- io_uring_cancel_sqpoll(file->private_data);
+ xa_for_each(&tctx->xa, index, node)
+ io_uring_cancel_sqpoll(node->ctx);
}
do {
--
2.24.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v3 3/6] io_uring: do ctx initiated file note removal
2021-03-05 12:58 [PATCH v3 5.12 0/6] remove task file notes Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 1/6] io_uring: make del_task_file more forgiving Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 2/6] io_uring: introduce ctx to tctx back map Pavel Begunkov
@ 2021-03-05 12:58 ` Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 4/6] io_uring: don't take task ring-file notes Pavel Begunkov
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-03-05 12:58 UTC (permalink / raw)
To: Jens Axboe, io-uring
Another preparation patch. When full quiesce is done on ctx exit, use
task_work infra to remove corresponding to the ctx io_uring->xa entries.
For that we use the back tctx map. Also use ->in_idle to prevent
removing it while we traversing ->xa on cancellation, just ignore it.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e62f28512cc4..9865b2c708c2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -987,6 +987,7 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_UNLINKAT] = {},
};
+static void io_uring_del_task_file(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files);
@@ -8528,10 +8529,33 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
return executed;
}
+struct io_tctx_exit {
+ struct callback_head task_work;
+ struct completion completion;
+ unsigned long index;
+};
+
+static void io_tctx_exit_cb(struct callback_head *cb)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ struct io_tctx_exit *work;
+
+ work = container_of(cb, struct io_tctx_exit, task_work);
+ /*
+ * When @in_idle, we're in cancellation and it's racy to remove the
+ * node. It'll be removed by the end of cancellation, just ignore it.
+ */
+ if (!atomic_read(&tctx->in_idle))
+ io_uring_del_task_file(work->index);
+ complete(&work->completion);
+}
+
static void io_ring_exit_work(struct work_struct *work)
{
- struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
- exit_work);
+ struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
+ struct io_tctx_exit exit;
+ struct io_tctx_node *node;
+ int ret;
/*
* If we're doing polled IO and end up having requests being
@@ -8542,6 +8566,26 @@ static void io_ring_exit_work(struct work_struct *work)
do {
io_uring_try_cancel_requests(ctx, NULL, NULL);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
+
+ mutex_lock(&ctx->uring_lock);
+ while (!list_empty(&ctx->tctx_list)) {
+ node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
+ ctx_node);
+ exit.index = (unsigned long)node->file;
+ init_completion(&exit.completion);
+ init_task_work(&exit.task_work, io_tctx_exit_cb);
+ ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
+ if (WARN_ON_ONCE(ret))
+ continue;
+ wake_up_process(node->task);
+
+ mutex_unlock(&ctx->uring_lock);
+ wait_for_completion(&exit.completion);
+ cond_resched();
+ mutex_lock(&ctx->uring_lock);
+ }
+ mutex_unlock(&ctx->uring_lock);
+
io_ring_ctx_free(ctx);
}
--
2.24.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v3 4/6] io_uring: don't take task ring-file notes
2021-03-05 12:58 [PATCH v3 5.12 0/6] remove task file notes Pavel Begunkov
` (2 preceding siblings ...)
2021-03-05 12:58 ` [PATCH v3 3/6] io_uring: do ctx initiated file note removal Pavel Begunkov
@ 2021-03-05 12:58 ` Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 5/6] io_uring: index io_uring->xa by ctx not file Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 6/6] io_uring: warn when ring exit takes too long Pavel Begunkov
5 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-03-05 12:58 UTC (permalink / raw)
To: Jens Axboe, io-uring
With ->flush() gone we're now leaving all uring file notes until the
task dies/execs, so the ctx will not be freed until all tasks that have
ever submit a request die. It was nicer with flush but not much, we
could have locked as described ctx in many cases.
Now we guarantee that ctx outlives all tctx in a sense that
io_ring_exit_work() waits for all tctxs to drop their corresponding
enties in ->xa, and ctx won't go away until then. Hence, additional
io_uring file reference (a.k.a. task file notes) are not needed anymore.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9865b2c708c2..d819d389f4ee 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8815,11 +8815,9 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
node->file = file;
node->task = current;
- get_file(file);
ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
node, GFP_KERNEL));
if (ret) {
- fput(file);
kfree(node);
return ret;
}
@@ -8850,6 +8848,8 @@ static void io_uring_del_task_file(unsigned long index)
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
+ if (!tctx)
+ return;
node = xa_erase(&tctx->xa, index);
if (!node)
return;
@@ -8863,7 +8863,6 @@ static void io_uring_del_task_file(unsigned long index)
if (tctx->last == node->file)
tctx->last = NULL;
- fput(node->file);
kfree(node);
}
--
2.24.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v3 5/6] io_uring: index io_uring->xa by ctx not file
2021-03-05 12:58 [PATCH v3 5.12 0/6] remove task file notes Pavel Begunkov
` (3 preceding siblings ...)
2021-03-05 12:58 ` [PATCH v3 4/6] io_uring: don't take task ring-file notes Pavel Begunkov
@ 2021-03-05 12:58 ` Pavel Begunkov
2021-03-05 12:58 ` [PATCH v3 6/6] io_uring: warn when ring exit takes too long Pavel Begunkov
5 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-03-05 12:58 UTC (permalink / raw)
To: Jens Axboe, io-uring
We don't use task file notes anymore, and no need left in indexing
task->io_uring->xa by file, and replace it with ctx. It's better
design-wise, especially since we keep a dangling file, and so have to
keep an eye on not dereferencing it.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 24 +++++++++++-------------
include/linux/io_uring.h | 2 +-
2 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d819d389f4ee..92ba3034de64 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -809,7 +809,6 @@ struct io_kiocb {
struct io_tctx_node {
struct list_head ctx_node;
struct task_struct *task;
- struct file *file;
struct io_ring_ctx *ctx;
};
@@ -8532,7 +8531,7 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
struct io_tctx_exit {
struct callback_head task_work;
struct completion completion;
- unsigned long index;
+ struct io_ring_ctx *ctx;
};
static void io_tctx_exit_cb(struct callback_head *cb)
@@ -8546,7 +8545,7 @@ static void io_tctx_exit_cb(struct callback_head *cb)
* node. It'll be removed by the end of cancellation, just ignore it.
*/
if (!atomic_read(&tctx->in_idle))
- io_uring_del_task_file(work->index);
+ io_uring_del_task_file((unsigned long)work->ctx);
complete(&work->completion);
}
@@ -8571,7 +8570,7 @@ static void io_ring_exit_work(struct work_struct *work)
while (!list_empty(&ctx->tctx_list)) {
node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
ctx_node);
- exit.index = (unsigned long)node->file;
+ exit.ctx = ctx;
init_completion(&exit.completion);
init_task_work(&exit.task_work, io_tctx_exit_cb);
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
@@ -8792,7 +8791,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
-static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
+static int io_uring_add_task_file(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -8804,18 +8803,17 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
return ret;
tctx = current->io_uring;
}
- if (tctx->last != file) {
- void *old = xa_load(&tctx->xa, (unsigned long)file);
+ if (tctx->last != ctx) {
+ void *old = xa_load(&tctx->xa, (unsigned long)ctx);
if (!old) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
if (!node)
return -ENOMEM;
node->ctx = ctx;
- node->file = file;
node->task = current;
- ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
+ ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
node, GFP_KERNEL));
if (ret) {
kfree(node);
@@ -8826,7 +8824,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
list_add(&node->ctx_node, &ctx->tctx_list);
mutex_unlock(&ctx->uring_lock);
}
- tctx->last = file;
+ tctx->last = ctx;
}
/*
@@ -8861,7 +8859,7 @@ static void io_uring_del_task_file(unsigned long index)
list_del(&node->ctx_node);
mutex_unlock(&node->ctx->uring_lock);
- if (tctx->last == node->file)
+ if (tctx->last == node->ctx)
tctx->last = NULL;
kfree(node);
}
@@ -9163,7 +9161,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
submitted = to_submit;
} else if (to_submit) {
- ret = io_uring_add_task_file(ctx, f.file);
+ ret = io_uring_add_task_file(ctx);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
@@ -9372,7 +9370,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
if (fd < 0)
return fd;
- ret = io_uring_add_task_file(ctx, file);
+ ret = io_uring_add_task_file(ctx);
if (ret) {
put_unused_fd(fd);
return ret;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 7cb7bd0e334c..9761a0ec9f95 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -18,7 +18,7 @@ struct io_uring_task {
/* submission side */
struct xarray xa;
struct wait_queue_head wait;
- struct file *last;
+ void *last;
void *io_wq;
struct percpu_counter inflight;
atomic_t in_idle;
--
2.24.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v3 6/6] io_uring: warn when ring exit takes too long
2021-03-05 12:58 [PATCH v3 5.12 0/6] remove task file notes Pavel Begunkov
` (4 preceding siblings ...)
2021-03-05 12:58 ` [PATCH v3 5/6] io_uring: index io_uring->xa by ctx not file Pavel Begunkov
@ 2021-03-05 12:58 ` Pavel Begunkov
5 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-03-05 12:58 UTC (permalink / raw)
To: Jens Axboe, io-uring
We use system_unbound_wq to run io_ring_exit_work(), so it's hard to
monitor whether removal hang or not. Add WARN_ONCE to catch hangs.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 92ba3034de64..116d082a248f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8552,6 +8552,7 @@ static void io_tctx_exit_cb(struct callback_head *cb)
static void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
+ unsigned long timeout = jiffies + HZ * 60 * 5;
struct io_tctx_exit exit;
struct io_tctx_node *node;
int ret;
@@ -8564,10 +8565,14 @@ static void io_ring_exit_work(struct work_struct *work)
*/
do {
io_uring_try_cancel_requests(ctx, NULL, NULL);
+
+ WARN_ON_ONCE(time_after(jiffies, timeout));
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
mutex_lock(&ctx->uring_lock);
while (!list_empty(&ctx->tctx_list)) {
+ WARN_ON_ONCE(time_after(jiffies, timeout));
+
node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
ctx_node);
exit.ctx = ctx;
--
2.24.0
^ permalink raw reply related [flat|nested] 7+ messages in thread