From: Pavel Begunkov <asml.silence@gmail.com>
To: io-uring@vger.kernel.org
Cc: asml.silence@gmail.com
Subject: [PATCH v2 8/8] io_uring: drain based on allocated reqs
Date: Fri, 9 May 2025 12:12:54 +0100
Message-ID: <46ece1e34320b046c06fee2498d6b4cd12a700f2.1746788718.git.asml.silence@gmail.com>
In-Reply-To: <cover.1746788718.git.asml.silence@gmail.com>
Don't rely on CQ sequence numbers for draining; that scheme has become
messy and needs cq_extra adjustments. Instead, base draining on the
number of allocated requests and only allow flushing once all allocated
requests are sitting in the drain list.
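To illustrate the idea, here is a tiny user-space model of the gating
rule (a sketch only, not the kernel code; the field names mirror the
nr_req_allocated / nr_drained counters this series introduces, and the
real io_queue_deferred() below also frees the request cache first so
that only in-flight requests are counted):

#include <stdbool.h>
#include <stdio.h>

struct ctx {
	unsigned nr_req_allocated;	/* all requests currently allocated */
	unsigned nr_drained;		/* requests parked on the drain list */
};

/*
 * The gating rule: drained requests may only be flushed once every
 * allocated request is accounted for in the drain list.
 */
static bool can_flush_drained(const struct ctx *ctx)
{
	return ctx->nr_req_allocated == ctx->nr_drained;
}

int main(void)
{
	struct ctx ctx = { .nr_req_allocated = 3, .nr_drained = 1 };

	/* 0: two requests are still in flight, keep deferring */
	printf("flush allowed: %d\n", can_flush_drained(&ctx));

	ctx.nr_drained = 3;	/* the remaining requests hit the drain list too */

	/* 1: everything is drained, safe to flush */
	printf("flush allowed: %d\n", can_flush_drained(&ctx));
	return 0;
}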
As a result, cq_extra is gone: there is no accounting overhead for it in
aux CQE posting, less bloat since that accounting was inlined, and the
scheme is in general simpler than trying to track where cq_extra should
be bumped and where it should be put back, as in the overflow case. It
should also help with cleaning up and unifying some of the CQ posting
helpers.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
include/linux/io_uring_types.h | 2 +-
io_uring/io_uring.c | 83 +++++++++++++++-------------------
io_uring/io_uring.h | 3 +-
3 files changed, 38 insertions(+), 50 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 73b289b48280..00dbd7cd0e7d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -341,7 +341,6 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
- unsigned cq_extra;
void *cq_wait_arg;
size_t cq_wait_size;
@@ -417,6 +416,7 @@ struct io_ring_ctx {
struct callback_head poll_wq_task_work;
struct list_head defer_list;
+ unsigned nr_drained;
struct io_alloc_cache msg_cache;
spinlock_t msg_lock;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 14188f49a4ce..0fda1b1a33ae 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -129,7 +129,6 @@
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
- u32 seq;
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
+static void __io_req_caches_free(struct io_ring_ctx *ctx);
static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@@ -540,46 +540,45 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
-static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq)
+static unsigned io_linked_nr(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *tmp;
+ unsigned nr = 0;
- return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
+ io_for_each_link(tmp, req)
+ nr++;
+ return nr;
}
-static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
bool drain_seen = false, first = true;
+ lockdep_assert_held(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
+
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
- if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq))
- break;
+ if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
+ return;
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue(de->req);
kfree(de);
first = false;
}
}
-static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
-{
- guard(spinlock)(&ctx->completion_lock);
- __io_queue_deferred(ctx);
-}
-
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->poll_activated)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
if (ctx->has_evfd)
io_eventfd_signal(ctx, true);
}
@@ -742,7 +741,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
* on the floor.
*/
WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
- ctx->cq_extra--;
set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
@@ -812,8 +810,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
{
struct io_uring_cqe *cqe;
- ctx->cq_extra++;
-
if (likely(io_get_cqe(ctx, &cqe))) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
@@ -1456,6 +1452,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
+
+ if (unlikely(ctx->drain_active))
+ io_queue_deferred(ctx);
+
ctx->submit_state.cq_flush = false;
}
@@ -1643,23 +1643,14 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
-static u32 io_get_sequence(struct io_kiocb *req)
-{
- u32 seq = req->ctx->cached_sq_head;
- struct io_kiocb *cur;
-
- /* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(cur, req)
- seq--;
- return seq;
-}
-
static __cold void io_drain_req(struct io_kiocb *req)
__must_hold(&ctx->uring_lock)
{
struct io_ring_ctx *ctx = req->ctx;
bool drain = req->flags & IOSQE_IO_DRAIN;
struct io_defer_entry *de;
+ struct io_kiocb *tmp;
+ int nr = 0;
de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT);
if (!de) {
@@ -1667,17 +1658,17 @@ static __cold void io_drain_req(struct io_kiocb *req)
return;
}
+ io_for_each_link(tmp, req)
+ nr++;
io_prep_async_link(req);
trace_io_uring_defer(req);
de->req = req;
- de->seq = io_get_sequence(req);
- scoped_guard(spinlock, &ctx->completion_lock) {
- list_add_tail(&de->list, &ctx->defer_list);
- __io_queue_deferred(ctx);
- if (!drain && list_empty(&ctx->defer_list))
- ctx->drain_active = false;
- }
+ ctx->nr_drained += io_linked_nr(req);
+ list_add_tail(&de->list, &ctx->defer_list);
+ io_queue_deferred(ctx);
+ if (!drain && list_empty(&ctx->defer_list))
+ ctx->drain_active = false;
}
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@@ -2260,10 +2251,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
(!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
head = READ_ONCE(ctx->sq_array[head]);
if (unlikely(head >= ctx->sq_entries)) {
- /* drop invalid entries */
- spin_lock(&ctx->completion_lock);
- ctx->cq_extra--;
- spin_unlock(&ctx->completion_lock);
WRITE_ONCE(ctx->rings->sq_dropped,
READ_ONCE(ctx->rings->sq_dropped) + 1);
return false;
@@ -2681,13 +2668,11 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
return off;
}
-static void io_req_caches_free(struct io_ring_ctx *ctx)
+static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
int nr = 0;
- mutex_lock(&ctx->uring_lock);
-
while (!io_req_cache_empty(ctx)) {
req = io_extract_req(ctx);
kmem_cache_free(req_cachep, req);
@@ -2697,7 +2682,12 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
ctx->nr_req_allocated -= nr;
percpu_ref_put_many(&ctx->refs, nr);
}
- mutex_unlock(&ctx->uring_lock);
+}
+
+static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
+{
+ guard(mutex)(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
}
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
@@ -3002,20 +2992,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct io_defer_entry *de;
LIST_HEAD(list);
- spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
if (io_match_task_safe(de->req, tctx, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
}
- spin_unlock(&ctx->completion_lock);
if (list_empty(&list))
return false;
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue_fail(de->req, -ECANCELED);
kfree(de);
}
@@ -3090,8 +3079,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
- ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
mutex_lock(&ctx->uring_lock);
+ ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
ret |= io_poll_remove_all(ctx, tctx, cancel_all);
ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
ret |= io_futex_remove_all(ctx, tctx, cancel_all);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e4050b2d0821..81f22196a57d 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
{
io_lockdep_assert_cq_locked(ctx);
- ctx->cq_extra++;
ctx->submit_state.cq_flush = true;
return io_get_cqe(ctx, cqe_ret);
}
@@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ if (unlikely(ctx->off_timeout_used ||
ctx->has_evfd || ctx->poll_activated))
__io_commit_cqring_flush(ctx);
}
--
2.49.0
Thread overview: 12+ messages
2025-05-09 11:12 [PATCH v2 0/8] allocated requests based drain and fixes Pavel Begunkov
2025-05-09 11:12 ` [PATCH v2 1/8] io_uring: account drain memory to cgroup Pavel Begunkov
2025-05-09 11:12 ` [PATCH v2 2/8] io_uring: fix spurious drain flushing Pavel Begunkov
2025-05-09 11:12 ` [PATCH v2 3/8] io_uring: simplify drain ret passing Pavel Begunkov
2025-05-09 11:12 ` [PATCH v2 4/8] io_uring: remove drain prealloc checks Pavel Begunkov
2025-05-09 11:12 ` [PATCH v2 5/8] io_uring: consolidate drain seq checking Pavel Begunkov
2025-05-09 11:12 ` [PATCH v2 6/8] io_uring: open code io_account_cq_overflow() Pavel Begunkov
2025-05-09 11:12 ` [PATCH v2 7/8] io_uring: count allocated requests Pavel Begunkov
2025-05-09 11:12 ` Pavel Begunkov [this message]
2025-05-13 10:37 ` [PATCH v2 8/8] io_uring: drain based on allocated reqs Andy Shevchenko
2025-05-13 13:35 ` Pavel Begunkov
2025-05-09 14:02 ` [PATCH v2 0/8] allocated requests based drain and fixes Jens Axboe