[PATCH for-next v2 12/12] io_uring: skip spinlocking for ->task_complete

public inbox for [email protected]
 help / color / mirror / Atom feed

From: Pavel Begunkov <[email protected]>
To: [email protected]
Cc: Jens Axboe <[email protected]>, [email protected]
Subject: [PATCH for-next v2 12/12] io_uring: skip spinlocking for ->task_complete
Date: Wed,  7 Dec 2022 03:53:37 +0000	[thread overview]
Message-ID: <2a8c91fd82cfcdcc1d2e5bac7051fe2c183bda73.1670384893.git.asml.silence@gmail.com> (raw)
In-Reply-To: <[email protected]>

->task_complete was added to serialise CQE posting by doing it from
the task context only (or fallback wq when the task is dead), and now we
can use that to avoid taking ->completion_lock while filling CQ entries.
The patch skips spinlocking only in two spots,
__io_submit_flush_completions() and flushing in io_aux_cqe(), as it's
safer and covers all cases we care about. Extra care is taken to force
taking the lock while queueing overflow entries.

It fundamentally relies on SINGLE_ISSUER to have only one task posting
events. It also needs to take into account overflowed CQEs, flushing of
which happens in the cq wait path, and so this implementation also needs
DEFER_TASKRUN to limit waiters. For the same reason we disable it for
SQPOLL, and for IOPOLL as it won't benefit from it in any case.
The DEFER_TASKRUN, SQPOLL and IOPOLL requirements may be relaxed in the
future.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 io_uring/io_uring.c | 71 +++++++++++++++++++++++++++++++++------------
 io_uring/io_uring.h | 12 ++++++--
 2 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0e424d8721ab..529ea5942dea 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -595,13 +595,25 @@ static inline void io_cq_unlock(struct io_ring_ctx *ctx)
 	spin_unlock(&ctx->completion_lock);
 }
 
+static inline void __io_cq_lock(struct io_ring_ctx *ctx)
+	__acquires(ctx->completion_lock)
+{
+	if (!ctx->task_complete)
+		spin_lock(&ctx->completion_lock);
+}
+
+static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
+{
+	if (!ctx->task_complete)
+		spin_unlock(&ctx->completion_lock);
+}
+
 /* keep it inlined for io_submit_flush_completions() */
-static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx)
+static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
 	__releases(ctx->completion_lock)
 {
 	io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
-
+	__io_cq_unlock(ctx);
 	io_commit_cqring_flush(ctx);
 	io_cqring_wake(ctx);
 }
@@ -609,7 +621,10 @@ static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx)
 void io_cq_unlock_post(struct io_ring_ctx *ctx)
 	__releases(ctx->completion_lock)
 {
-	io_cq_unlock_post_inline(ctx);
+	io_commit_cqring(ctx);
+	spin_unlock(&ctx->completion_lock);
+	io_commit_cqring_flush(ctx);
+	io_cqring_wake(ctx);
 }
 
 /* Returns true if there are no backlogged entries after the flush */
@@ -796,12 +811,13 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
 	return &rings->cqes[off];
 }
 
-static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
-			    bool allow_overflow)
+static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
+			      u32 cflags)
 {
 	struct io_uring_cqe *cqe;
 
-	lockdep_assert_held(&ctx->completion_lock);
+	if (!ctx->task_complete)
+		lockdep_assert_held(&ctx->completion_lock);
 
 	ctx->cq_extra++;
 
@@ -824,10 +840,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32
 		}
 		return true;
 	}
-
-	if (allow_overflow)
-		return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
-
 	return false;
 }
 
@@ -841,7 +853,17 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
 	for (i = 0; i < state->cqes_count; i++) {
 		struct io_uring_cqe *cqe = &state->cqes[i];
 
-		io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags, true);
+		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
+			if (ctx->task_complete) {
+				spin_lock(&ctx->completion_lock);
+				io_cqring_event_overflow(ctx, cqe->user_data,
+							cqe->res, cqe->flags, 0, 0);
+				spin_unlock(&ctx->completion_lock);
+			} else {
+				io_cqring_event_overflow(ctx, cqe->user_data,
+							cqe->res, cqe->flags, 0, 0);
+			}
+		}
 	}
 	state->cqes_count = 0;
 }
@@ -852,7 +874,10 @@ static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u
 	bool filled;
 
 	io_cq_lock(ctx);
-	filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow);
+	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
+	if (!filled && allow_overflow)
+		filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+
 	io_cq_unlock_post(ctx);
 	return filled;
 }
@@ -876,10 +901,10 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32
 	lockdep_assert_held(&ctx->uring_lock);
 
 	if (ctx->submit_state.cqes_count == length) {
-		io_cq_lock(ctx);
+		__io_cq_lock(ctx);
 		__io_flush_post_cqes(ctx);
 		/* no need to flush - flush is deferred */
-		io_cq_unlock(ctx);
+		__io_cq_unlock_post(ctx);
 	}
 
 	/* For defered completions this is not as strict as it is otherwise,
@@ -1414,7 +1439,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	struct io_wq_work_node *node, *prev;
 	struct io_submit_state *state = &ctx->submit_state;
 
-	io_cq_lock(ctx);
+	__io_cq_lock(ctx);
 	/* must come first to preserve CQE ordering in failure cases */
 	if (state->cqes_count)
 		__io_flush_post_cqes(ctx);
@@ -1422,10 +1447,18 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 					    comp_list);
 
-		if (!(req->flags & REQ_F_CQE_SKIP))
-			io_fill_cqe_req(ctx, req);
+		if (!(req->flags & REQ_F_CQE_SKIP) &&
+		    unlikely(!__io_fill_cqe_req(ctx, req))) {
+			if (ctx->task_complete) {
+				spin_lock(&ctx->completion_lock);
+				io_req_cqe_overflow(req);
+				spin_unlock(&ctx->completion_lock);
+			} else {
+				io_req_cqe_overflow(req);
+			}
+		}
 	}
-	io_cq_unlock_post_inline(ctx);
+	__io_cq_unlock_post(ctx);
 
 	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
 		io_free_batch_list(ctx, state->compl_reqs.first);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 62227ec3260c..c117e029c8dc 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -110,7 +110,7 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 	return io_get_cqe_overflow(ctx, false);
 }
 
-static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
+static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
 				     struct io_kiocb *req)
 {
 	struct io_uring_cqe *cqe;
@@ -122,7 +122,7 @@ static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 	 */
 	cqe = io_get_cqe(ctx);
 	if (unlikely(!cqe))
-		return io_req_cqe_overflow(req);
+		return false;
 
 	trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
 				req->cqe.res, req->cqe.flags,
@@ -145,6 +145,14 @@ static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 	return true;
 }
 
+static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	if (likely(__io_fill_cqe_req(ctx, req)))
+		return true;
+	return io_req_cqe_overflow(req);
+}
+
 static inline void req_set_fail(struct io_kiocb *req)
 {
 	req->flags |= REQ_F_FAIL;
-- 
2.38.1

     prev parent reply	other threads:[~2022-12-07  3:54 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-12-07  3:53 [PATCH for-next v2 00/12] CQ locking optimisation Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 01/12] io_uring: dont remove file from msg_ring reqs Pavel Begunkov
2022-12-07 13:52   ` Jens Axboe
2022-12-07 21:12     ` Pavel Begunkov
2022-12-07 21:23       ` Jens Axboe
2022-12-07  3:53 ` [PATCH for-next v2 02/12] io_uring: improve io_double_lock_ctx fail handling Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 03/12] io_uring: skip overflow CQE posting for dying ring Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 04/12] io_uring: don't check overflow flush failures Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 05/12] io_uring: complete all requests in task context Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 06/12] io_uring: force multishot CQEs into " Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 07/12] io_uring: use tw for putting rsrc Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 08/12] io_uring: never run tw and fallback in parallel Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 09/12] io_uring: get rid of double locking Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 10/12] io_uring: extract a io_msg_install_complete helper Pavel Begunkov
2022-12-07  3:53 ` [PATCH for-next v2 11/12] io_uring: do msg_ring in target task via tw Pavel Begunkov
2022-12-07 15:31   ` Jens Axboe
2022-12-07 15:51     ` Jens Axboe
2022-12-07 21:18       ` Pavel Begunkov
2022-12-07 21:22         ` Jens Axboe
2022-12-07  3:53 ` Pavel Begunkov [this message]

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:0e424d8721a dfblob:529ea5942de dfblob:62227ec3260
dfblob:c117e029c8d )
 OR (
bs:"[PATCH for-next v2 12/12] io_uring: skip spinlocking for ->task_complete" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2a8c91fd82cfcdcc1d2e5bac7051fe2c183bda73.1670384893.git.asml.silence@gmail.com \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox