* [PATCH 2/5] io_uring: mark exit side kworkers as task_work capable
2025-03-21 19:24 [PATCHSET RFC v2 0/5] Cancel and wait for all requests on exit Jens Axboe
2025-03-21 19:24 ` [PATCH 1/5] fs: gate final fput task_work on PF_NO_TASKWORK Jens Axboe
@ 2025-03-21 19:24 ` Jens Axboe
2025-03-21 19:24 ` [PATCH 3/5] io_uring: consider ring dead once the ref is marked dying Jens Axboe
From: Jens Axboe @ 2025-03-21 19:24 UTC
To: io-uring; +Cc: asml.silence, Jens Axboe
There are two types of work here:

1) Fallback work, if the task is exiting
2) The exit-side cancelations

Both of these may do the final fput() of a file. When that happens,
fput() schedules delayed work, which slows down exits when io_uring
needs to wait for that work to finish. It is possible to flush this via
flush_delayed_fput(), but that's a big hammer, as unrelated files,
potentially from other tasks, could get flushed as well.
Add two io_uring helpers that temporarily clear PF_NO_TASKWORK for
these worker threads, and run any queued task_work before setting the
flag again. That way, only the related items, the ones that received
their final fput() as part of work cancelation and flushing, get
flushed.
For now these helpers are io_uring private, but they could obviously be
made generally available, should there be a need to do so.
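
For reference, the intended usage is simply bracketing the exit-side
worker functions. A minimal sketch (io_exit_side_work() is a made-up
stand-in here; the real call sites are in the hunks below):

	static void io_exit_side_work(struct work_struct *work)
	{
		io_kworker_tw_start();	/* clear PF_NO_TASKWORK for this kworker */

		/* cancelation/fallback processing that may do the final fput() */

		io_kworker_tw_end();	/* run queued task_work, restore the flag */
	}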
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/io_uring.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5f625be52e52..2b9dae588f04 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -238,6 +238,20 @@ static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx
wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}
+static __cold void io_kworker_tw_start(void)
+{
+ if (WARN_ON_ONCE(!(current->flags & PF_NO_TASKWORK)))
+ return;
+ current->flags &= ~PF_NO_TASKWORK;
+}
+
+static __cold void io_kworker_tw_end(void)
+{
+ while (task_work_pending(current))
+ task_work_run();
+ current->flags |= PF_NO_TASKWORK;
+}
+
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -253,6 +267,8 @@ static __cold void io_fallback_req_func(struct work_struct *work)
struct io_kiocb *req, *tmp;
struct io_tw_state ts = {};
+ io_kworker_tw_start();
+
percpu_ref_get(&ctx->refs);
mutex_lock(&ctx->uring_lock);
llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
@@ -260,6 +276,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
percpu_ref_put(&ctx->refs);
+ io_kworker_tw_end();
}
static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
@@ -2879,6 +2896,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
struct io_tctx_node *node;
int ret;
+ io_kworker_tw_start();
+
/*
* If we're doing polled IO and end up having requests being
* submitted async (out-of-line), then completions can come in while
@@ -2935,6 +2954,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
*/
} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
+ io_kworker_tw_end();
+
init_completion(&exit.completion);
init_task_work(&exit.task_work, io_tctx_exit_cb);
exit.ctx = ctx;
--
2.49.0
* [PATCH 5/5] io_uring: switch away from percpu refcounts
2025-03-21 19:24 [PATCHSET RFC v2 0/5] Cancel and wait for all requests on exit Jens Axboe
2025-03-21 19:24 ` [PATCH 4/5] io_uring: wait for cancelations on final ring put Jens Axboe
@ 2025-03-21 19:24 ` Jens Axboe
From: Jens Axboe @ 2025-03-21 19:24 UTC
To: io-uring; +Cc: asml.silence, Jens Axboe
For the common cases, the io_uring ctx reference counts are all taken
and dropped in batches, and hence don't need to be percpu references.
Switching to a plain atomic saves some memory, but more importantly it
gets rid of the full RCU grace period needed to tear down a percpu
reference. With io_uring now waiting for cancelations and IO during
exit, that grace period slows down the teardown a lot, making it up to
100x slower.
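
The replacement, added to refs.h below, is a plain atomic_long_t where
bit 63 marks the reference as dying and the remaining bits hold the
count. A rough sketch of the intended lifetime (illustration only, not
a call sequence lifted from the code):

	io_ring_ref_init(ctx);	/* refs = 1, the initial ref */
	io_ring_ref_get(ctx);	/* refs = 2, e.g. a pending request */
	io_ring_ref_kill(ctx);	/* set IO_RING_REF_DEAD, drop the initial ref */
	io_ring_ref_put(ctx);	/* count hits 0 -> complete(&ctx->ref_comp) */
	/* io_ring_exit_work() waits on ctx->ref_comp and can then proceed */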
Signed-off-by: Jens Axboe <[email protected]>
---
include/linux/io_uring_types.h | 2 +-
io_uring/io_uring.c | 47 ++++++++++++----------------------
io_uring/io_uring.h | 3 ++-
io_uring/msg_ring.c | 4 +--
io_uring/refs.h | 43 +++++++++++++++++++++++++++++++
io_uring/register.c | 2 +-
io_uring/rw.c | 2 +-
io_uring/sqpoll.c | 2 +-
io_uring/zcrx.c | 4 +--
9 files changed, 70 insertions(+), 39 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 79e223fd4733..8894b0639a3a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -256,7 +256,7 @@ struct io_ring_ctx {
struct task_struct *submitter_task;
struct io_rings *rings;
- struct percpu_ref refs;
+ atomic_long_t refs;
clockid_t clockid;
enum tk_offsets clock_offset;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d9b65a322ae1..69b8f3237b1a 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -252,13 +252,6 @@ static __cold void io_kworker_tw_end(void)
current->flags |= PF_NO_TASKWORK;
}
-static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
-{
- struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
-
- complete(&ctx->ref_comp);
-}
-
static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
@@ -269,13 +262,13 @@ static __cold void io_fallback_req_func(struct work_struct *work)
io_kworker_tw_start();
- percpu_ref_get(&ctx->refs);
+ io_ring_ref_get(ctx);
mutex_lock(&ctx->uring_lock);
llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
req->io_task_work.func(req, ts);
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
io_kworker_tw_end();
}
@@ -333,10 +326,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
hash_bits = clamp(hash_bits, 1, 8);
if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
goto err;
- if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
- 0, GFP_KERNEL))
- goto err;
+ io_ring_ref_init(ctx);
ctx->flags = p->flags;
ctx->hybrid_poll_time = LLONG_MAX;
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
@@ -360,7 +351,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ret |= io_futex_cache_init(ctx);
ret |= io_rsrc_cache_init(ctx);
if (ret)
- goto free_ref;
+ goto err;
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
@@ -386,9 +377,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
mutex_init(&ctx->mmap_lock);
return ctx;
-
-free_ref:
- percpu_ref_exit(&ctx->refs);
err:
io_free_alloc_caches(ctx);
kvfree(ctx->cancel_table.hbs);
@@ -556,7 +544,7 @@ static void io_queue_iowq(struct io_kiocb *req)
* worker for it).
*/
if (WARN_ON_ONCE(!same_thread_group(tctx->task, current) &&
- !percpu_ref_is_dying(&req->ctx->refs)))
+ !io_ring_ref_is_dying(req->ctx)))
atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
@@ -998,7 +986,7 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
ret = 1;
}
- percpu_ref_get_many(&ctx->refs, ret);
+ io_ring_ref_get_many(ctx, ret);
while (ret--) {
struct io_kiocb *req = reqs[ret];
@@ -1053,7 +1041,7 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw)
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
}
/*
@@ -1077,7 +1065,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
ctx_flush_and_put(ctx, ts);
ctx = req->ctx;
mutex_lock(&ctx->uring_lock);
- percpu_ref_get(&ctx->refs);
+ io_ring_ref_get(ctx);
}
INDIRECT_CALL_2(req->io_task_work.func,
io_poll_task_func, io_req_rw_complete,
@@ -1106,10 +1094,10 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
if (sync && last_ctx != req->ctx) {
if (last_ctx) {
flush_delayed_work(&last_ctx->fallback_work);
- percpu_ref_put(&last_ctx->refs);
+ io_ring_ref_put(last_ctx);
}
last_ctx = req->ctx;
- percpu_ref_get(&last_ctx->refs);
+ io_ring_ref_get(last_ctx);
}
if (llist_add(&req->io_task_work.node,
&req->ctx->fallback_llist))
@@ -1118,7 +1106,7 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
if (last_ctx) {
flush_delayed_work(&last_ctx->fallback_work);
- percpu_ref_put(&last_ctx->refs);
+ io_ring_ref_put(last_ctx);
}
}
@@ -1255,7 +1243,7 @@ static void io_req_normal_work_add(struct io_kiocb *req)
return;
}
- if (!percpu_ref_is_dying(&ctx->refs) &&
+ if (!io_ring_ref_is_dying(ctx) &&
!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))
return;
@@ -2739,7 +2727,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
nr++;
}
if (nr)
- percpu_ref_put_many(&ctx->refs, nr);
+ io_ring_ref_put_many(ctx, nr);
mutex_unlock(&ctx->uring_lock);
}
@@ -2773,7 +2761,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
static_branch_dec(&io_key_has_sqarray);
- percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
io_req_caches_free(ctx);
if (ctx->hash_map)
@@ -2798,7 +2785,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
* might've been lost due to loose synchronisation.
*/
wake_up_all(&ctx->poll_wq);
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
}
__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
@@ -2816,9 +2803,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
* only need to sync with it, which is done by injecting a tw
*/
init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
- percpu_ref_get(&ctx->refs);
+ io_ring_ref_get(ctx);
if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
out:
spin_unlock(&ctx->completion_lock);
}
@@ -3005,7 +2992,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
struct creds *creds;
mutex_lock(&ctx->uring_lock);
- percpu_ref_kill(&ctx->refs);
+ io_ring_ref_kill(ctx);
xa_for_each(&ctx->personalities, index, creds)
io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 87f883130286..67e5921771be 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -13,6 +13,7 @@
#include "slist.h"
#include "filetable.h"
#include "opdef.h"
+#include "refs.h"
#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -143,7 +144,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
* Not from an SQE, as those cannot be submitted, but via
* updating tagged resources.
*/
- if (!percpu_ref_is_dying(&ctx->refs))
+ if (!io_ring_ref_is_dying(ctx))
lockdep_assert(current == ctx->submitter_task);
}
#endif
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 0bbcbbcdebfd..30d4cabb66d6 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -83,7 +83,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
}
if (req)
kmem_cache_free(req_cachep, req);
- percpu_ref_put(&ctx->refs);
+ io_ring_ref_put(ctx);
}
static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -95,7 +95,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
req->cqe.user_data = user_data;
io_req_set_res(req, res, cflags);
- percpu_ref_get(&ctx->refs);
+ io_ring_ref_get(ctx);
req->ctx = ctx;
req->tctx = NULL;
req->io_task_work.func = io_msg_tw_complete;
diff --git a/io_uring/refs.h b/io_uring/refs.h
index 63982ead9f7d..a794e6980cb8 100644
--- a/io_uring/refs.h
+++ b/io_uring/refs.h
@@ -52,4 +52,47 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
{
__io_req_set_refcount(req, 1);
}
+
+#define IO_RING_REF_DEAD (1ULL << 63)
+#define IO_RING_REF_MASK (~IO_RING_REF_DEAD)
+
+static inline bool io_ring_ref_is_dying(struct io_ring_ctx *ctx)
+{
+ return atomic_long_read(&ctx->refs) & IO_RING_REF_DEAD;
+}
+
+static inline void io_ring_ref_put_many(struct io_ring_ctx *ctx, int nr_refs)
+{
+ unsigned long refs;
+
+ refs = atomic_long_sub_return(nr_refs, &ctx->refs);
+ if (!(refs & IO_RING_REF_MASK))
+ complete(&ctx->ref_comp);
+}
+
+static inline void io_ring_ref_put(struct io_ring_ctx *ctx)
+{
+ io_ring_ref_put_many(ctx, 1);
+}
+
+static inline void io_ring_ref_kill(struct io_ring_ctx *ctx)
+{
+ atomic_long_xor(IO_RING_REF_DEAD, &ctx->refs);
+ io_ring_ref_put(ctx);
+}
+
+static inline void io_ring_ref_init(struct io_ring_ctx *ctx)
+{
+ atomic_long_set(&ctx->refs, 1);
+}
+
+static inline void io_ring_ref_get_many(struct io_ring_ctx *ctx, int nr_refs)
+{
+ atomic_long_add(nr_refs, &ctx->refs);
+}
+
+static inline void io_ring_ref_get(struct io_ring_ctx *ctx)
+{
+ atomic_long_inc(&ctx->refs);
+}
#endif
diff --git a/io_uring/register.c b/io_uring/register.c
index cc23a4c205cd..54fe94a0101b 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -637,7 +637,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
* We don't quiesce the refs for register anymore and so it can't be
* dying as we're holding a file ref here.
*/
- if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
+ if (WARN_ON_ONCE(io_ring_ref_is_dying(ctx)))
return -ENXIO;
if (ctx->submitter_task && ctx->submitter_task != current)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 039e063f7091..e010d548edea 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -496,7 +496,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
* Don't attempt to reissue from that path, just let it fail with
* -EAGAIN.
*/
- if (percpu_ref_is_dying(&ctx->refs))
+ if (io_ring_ref_is_dying(ctx))
return false;
io_meta_restore(io, &rw->kiocb);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index d037cc68e9d3..b71f8d52386e 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -184,7 +184,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
* Don't submit if refs are dying, good for io_uring_register(),
* but also it is relied upon by io_ring_exit_work()
*/
- if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
+ if (to_submit && likely(!io_ring_ref_is_dying(ctx)) &&
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 9c95b5b6ec4e..07719e3bf1b3 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -629,7 +629,7 @@ static int io_pp_zc_init(struct page_pool *pp)
if (pp->p.dma_dir != DMA_FROM_DEVICE)
return -EOPNOTSUPP;
- percpu_ref_get(&ifq->ctx->refs);
+ io_ring_ref_get(ifq->ctx);
return 0;
}
@@ -640,7 +640,7 @@ static void io_pp_zc_destroy(struct page_pool *pp)
if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
return;
- percpu_ref_put(&ifq->ctx->refs);
+ io_ring_ref_put(ifq->ctx);
}
static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
--
2.49.0