[PATCH 5/5] io_uring: switch away from percpu refcounts

public inbox for [email protected]
 help / color / mirror / Atom feed

From: Jens Axboe <[email protected]>
To: [email protected]
Cc: [email protected], Jens Axboe <[email protected]>
Subject: [PATCH 5/5] io_uring: switch away from percpu refcounts
Date: Fri, 21 Mar 2025 13:24:59 -0600	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

For the common cases, the io_uring ref counts are all batched and hence
need not be a percpu reference. Dropping the percpu ref saves some
memory, but more importantly it gets rid of needing a full RCU grace
period when tearing down the reference. With io_uring now waiting on
cancelations and IO during exit, that grace period slows down the
teardown a lot, making it up to 100x slower.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 47 ++++++++++++----------------------
 io_uring/io_uring.h            |  3 ++-
 io_uring/msg_ring.c            |  4 +--
 io_uring/refs.h                | 43 +++++++++++++++++++++++++++++++
 io_uring/register.c            |  2 +-
 io_uring/rw.c                  |  2 +-
 io_uring/sqpoll.c              |  2 +-
 io_uring/zcrx.c                |  4 +--
 9 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 79e223fd4733..8894b0639a3a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -256,7 +256,7 @@ struct io_ring_ctx {
 
 		struct task_struct	*submitter_task;
 		struct io_rings		*rings;
-		struct percpu_ref	refs;
+		atomic_long_t		refs;
 
 		clockid_t		clockid;
 		enum tk_offsets		clock_offset;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d9b65a322ae1..69b8f3237b1a 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -252,13 +252,6 @@ static __cold void io_kworker_tw_end(void)
 	current->flags |= PF_NO_TASKWORK;
 }
 
-static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
-{
-	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
-
-	complete(&ctx->ref_comp);
-}
-
 static __cold void io_fallback_req_func(struct work_struct *work)
 {
 	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
@@ -269,13 +262,13 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 
 	io_kworker_tw_start();
 
-	percpu_ref_get(&ctx->refs);
+	io_ring_ref_get(ctx);
 	mutex_lock(&ctx->uring_lock);
 	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
 		req->io_task_work.func(req, ts);
 	io_submit_flush_completions(ctx);
 	mutex_unlock(&ctx->uring_lock);
-	percpu_ref_put(&ctx->refs);
+	io_ring_ref_put(ctx);
 	io_kworker_tw_end();
 }
 
@@ -333,10 +326,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	hash_bits = clamp(hash_bits, 1, 8);
 	if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
 		goto err;
-	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
-			    0, GFP_KERNEL))
-		goto err;
 
+	io_ring_ref_init(ctx);
 	ctx->flags = p->flags;
 	ctx->hybrid_poll_time = LLONG_MAX;
 	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
@@ -360,7 +351,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	ret |= io_futex_cache_init(ctx);
 	ret |= io_rsrc_cache_init(ctx);
 	if (ret)
-		goto free_ref;
+		goto err;
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
@@ -386,9 +377,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->mmap_lock);
 
 	return ctx;
-
-free_ref:
-	percpu_ref_exit(&ctx->refs);
 err:
 	io_free_alloc_caches(ctx);
 	kvfree(ctx->cancel_table.hbs);
@@ -556,7 +544,7 @@ static void io_queue_iowq(struct io_kiocb *req)
 	 * worker for it).
 	 */
 	if (WARN_ON_ONCE(!same_thread_group(tctx->task, current) &&
-			 !percpu_ref_is_dying(&req->ctx->refs)))
+			 !io_ring_ref_is_dying(req->ctx)))
 		atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
 
 	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
@@ -998,7 +986,7 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 		ret = 1;
 	}
 
-	percpu_ref_get_many(&ctx->refs, ret);
+	io_ring_ref_get_many(ctx, ret);
 	while (ret--) {
 		struct io_kiocb *req = reqs[ret];
 
@@ -1053,7 +1041,7 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw)
 
 	io_submit_flush_completions(ctx);
 	mutex_unlock(&ctx->uring_lock);
-	percpu_ref_put(&ctx->refs);
+	io_ring_ref_put(ctx);
 }
 
 /*
@@ -1077,7 +1065,7 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
 			ctx_flush_and_put(ctx, ts);
 			ctx = req->ctx;
 			mutex_lock(&ctx->uring_lock);
-			percpu_ref_get(&ctx->refs);
+			io_ring_ref_get(ctx);
 		}
 		INDIRECT_CALL_2(req->io_task_work.func,
 				io_poll_task_func, io_req_rw_complete,
@@ -1106,10 +1094,10 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
 		if (sync && last_ctx != req->ctx) {
 			if (last_ctx) {
 				flush_delayed_work(&last_ctx->fallback_work);
-				percpu_ref_put(&last_ctx->refs);
+				io_ring_ref_put(last_ctx);
 			}
 			last_ctx = req->ctx;
-			percpu_ref_get(&last_ctx->refs);
+			io_ring_ref_get(last_ctx);
 		}
 		if (llist_add(&req->io_task_work.node,
 			      &req->ctx->fallback_llist))
@@ -1118,7 +1106,7 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
 
 	if (last_ctx) {
 		flush_delayed_work(&last_ctx->fallback_work);
-		percpu_ref_put(&last_ctx->refs);
+		io_ring_ref_put(last_ctx);
 	}
 }
 
@@ -1255,7 +1243,7 @@ static void io_req_normal_work_add(struct io_kiocb *req)
 		return;
 	}
 
-	if (!percpu_ref_is_dying(&ctx->refs) &&
+	if (!io_ring_ref_is_dying(ctx) &&
 	    !task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))
 		return;
 
@@ -2739,7 +2727,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 		nr++;
 	}
 	if (nr)
-		percpu_ref_put_many(&ctx->refs, nr);
+		io_ring_ref_put_many(ctx, nr);
 	mutex_unlock(&ctx->uring_lock);
 }
 
@@ -2773,7 +2761,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
 		static_branch_dec(&io_key_has_sqarray);
 
-	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
 	io_req_caches_free(ctx);
 	if (ctx->hash_map)
@@ -2798,7 +2785,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
 	 * might've been lost due to loose synchronisation.
 	 */
 	wake_up_all(&ctx->poll_wq);
-	percpu_ref_put(&ctx->refs);
+	io_ring_ref_put(ctx);
 }
 
 __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
@@ -2816,9 +2803,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
 	 * only need to sync with it, which is done by injecting a tw
 	 */
 	init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
-	percpu_ref_get(&ctx->refs);
+	io_ring_ref_get(ctx);
 	if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
-		percpu_ref_put(&ctx->refs);
+		io_ring_ref_put(ctx);
 out:
 	spin_unlock(&ctx->completion_lock);
 }
@@ -3005,7 +2992,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	struct creds *creds;
 
 	mutex_lock(&ctx->uring_lock);
-	percpu_ref_kill(&ctx->refs);
+	io_ring_ref_kill(ctx);
 	xa_for_each(&ctx->personalities, index, creds)
 		io_unregister_personality(ctx, index);
 	mutex_unlock(&ctx->uring_lock);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 87f883130286..67e5921771be 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -13,6 +13,7 @@
 #include "slist.h"
 #include "filetable.h"
 #include "opdef.h"
+#include "refs.h"
 
 #ifndef CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -143,7 +144,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 		 * Not from an SQE, as those cannot be submitted, but via
 		 * updating tagged resources.
 		 */
-		if (!percpu_ref_is_dying(&ctx->refs))
+		if (!io_ring_ref_is_dying(ctx))
 			lockdep_assert(current == ctx->submitter_task);
 	}
 #endif
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 0bbcbbcdebfd..30d4cabb66d6 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -83,7 +83,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
 	}
 	if (req)
 		kmem_cache_free(req_cachep, req);
-	percpu_ref_put(&ctx->refs);
+	io_ring_ref_put(ctx);
 }
 
 static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -95,7 +95,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	}
 	req->cqe.user_data = user_data;
 	io_req_set_res(req, res, cflags);
-	percpu_ref_get(&ctx->refs);
+	io_ring_ref_get(ctx);
 	req->ctx = ctx;
 	req->tctx = NULL;
 	req->io_task_work.func = io_msg_tw_complete;
diff --git a/io_uring/refs.h b/io_uring/refs.h
index 63982ead9f7d..a794e6980cb8 100644
--- a/io_uring/refs.h
+++ b/io_uring/refs.h
@@ -52,4 +52,47 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
 {
 	__io_req_set_refcount(req, 1);
 }
+
+#define IO_RING_REF_DEAD	(1UL << (BITS_PER_LONG - 1))	/* top bit of the atomic_long_t; a fixed bit 63 does not fit a 32-bit long */
+#define IO_RING_REF_MASK	(~IO_RING_REF_DEAD)	/* all bits below DEAD, i.e. the reference count itself */
+
+static inline bool io_ring_ref_is_dying(struct io_ring_ctx *ctx)
+{
+	return atomic_long_read(&ctx->refs) & IO_RING_REF_DEAD;	/* true while the DEAD flag is set in ctx->refs */
+}
+
+static inline void io_ring_ref_put_many(struct io_ring_ctx *ctx, int nr_refs)
+{
+	unsigned long refs;	/* counter value after the subtraction below */
+
+	refs = atomic_long_sub_return(nr_refs, &ctx->refs);
+	if (!(refs & IO_RING_REF_MASK))	/* count reached zero; the DEAD flag is masked out of the test */
+		complete(&ctx->ref_comp);	/* signal whoever waits on ctx->ref_comp */
+}
+
+static inline void io_ring_ref_put(struct io_ring_ctx *ctx)
+{
+	io_ring_ref_put_many(ctx, 1);	/* drop exactly one reference */
+}
+
+static inline void io_ring_ref_kill(struct io_ring_ctx *ctx)
+{
+	atomic_long_xor(IO_RING_REF_DEAD, &ctx->refs);	/* xor toggles the flag: a second kill would clear DEAD again */
+	io_ring_ref_put(ctx);	/* drops one reference; presumably the initial one set by io_ring_ref_init() - confirm */
+}
+
+static inline void io_ring_ref_init(struct io_ring_ctx *ctx)
+{
+	atomic_long_set(&ctx->refs, 1);	/* start with a count of one and the DEAD flag clear */
+}
+
+static inline void io_ring_ref_get_many(struct io_ring_ctx *ctx, int nr_refs)
+{
+	atomic_long_add(nr_refs, &ctx->refs);	/* unconditional add; IO_RING_REF_DEAD is not checked here */
+}
+
+static inline void io_ring_ref_get(struct io_ring_ctx *ctx)
+{
+	atomic_long_inc(&ctx->refs);	/* unconditional increment; IO_RING_REF_DEAD is not checked here */
+}
 #endif
diff --git a/io_uring/register.c b/io_uring/register.c
index cc23a4c205cd..54fe94a0101b 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -637,7 +637,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	 * We don't quiesce the refs for register anymore and so it can't be
 	 * dying as we're holding a file ref here.
 	 */
-	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
+	if (WARN_ON_ONCE(io_ring_ref_is_dying(ctx)))
 		return -ENXIO;
 
 	if (ctx->submitter_task && ctx->submitter_task != current)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 039e063f7091..e010d548edea 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -496,7 +496,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
 	 * Don't attempt to reissue from that path, just let it fail with
 	 * -EAGAIN.
 	 */
-	if (percpu_ref_is_dying(&ctx->refs))
+	if (io_ring_ref_is_dying(ctx))
 		return false;
 
 	io_meta_restore(io, &rw->kiocb);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index d037cc68e9d3..b71f8d52386e 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -184,7 +184,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 		 * Don't submit if refs are dying, good for io_uring_register(),
 		 * but also it is relied upon by io_ring_exit_work()
 		 */
-		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
+		if (to_submit && likely(!io_ring_ref_is_dying(ctx)) &&
 		    !(ctx->flags & IORING_SETUP_R_DISABLED))
 			ret = io_submit_sqes(ctx, to_submit);
 		mutex_unlock(&ctx->uring_lock);
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 9c95b5b6ec4e..07719e3bf1b3 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -629,7 +629,7 @@ static int io_pp_zc_init(struct page_pool *pp)
 	if (pp->p.dma_dir != DMA_FROM_DEVICE)
 		return -EOPNOTSUPP;
 
-	percpu_ref_get(&ifq->ctx->refs);
+	io_ring_ref_get(ifq->ctx);
 	return 0;
 }
 
@@ -640,7 +640,7 @@ static void io_pp_zc_destroy(struct page_pool *pp)
 
 	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
 		return;
-	percpu_ref_put(&ifq->ctx->refs);
+	io_ring_ref_put(ifq->ctx);
 }
 
 static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
-- 
2.49.0

     prev parent reply	other threads:[~2025-03-21 19:31 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-03-21 19:24 [PATCHSET RFC v2 0/5] Cancel and wait for all requests on exit Jens Axboe
2025-03-21 19:24 ` [PATCH 1/5] fs: gate final fput task_work on PF_NO_TASKWORK Jens Axboe
2025-03-21 19:24 ` [PATCH 2/5] io_uring: mark exit side kworkers as task_work capable Jens Axboe
2025-03-21 19:24 ` [PATCH 3/5] io_uring: consider ring dead once the ref is marked dying Jens Axboe
2025-03-21 21:22   ` Pavel Begunkov
2025-03-21 19:24 ` [PATCH 4/5] io_uring: wait for cancelations on final ring put Jens Axboe
2025-03-21 19:24 ` Jens Axboe [this message]

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:79e223fd473 dfblob:8894b0639a3 dfblob:d9b65a322ae
dfblob:69b8f3237b1 dfblob:87f88313028 dfblob:67e5921771b
dfblob:0bbcbbcdebf dfblob:30d4cabb66d dfblob:63982ead9f7
dfblob:a794e6980cb dfblob:cc23a4c205c dfblob:54fe94a0101
dfblob:039e063f709 dfblob:e010d548ede dfblob:d037cc68e9d
dfblob:b71f8d52386 dfblob:9c95b5b6ec4 dfblob:07719e3bf1b )
 OR (
bs:"[PATCH 5/5] io_uring: switch away from percpu refcounts" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox