[RFC 2/2] io_uring: reduce scheduling due to tw

public inbox for [email protected]
 help / color / mirror / Atom feed

From: Pavel Begunkov <[email protected]>
To: [email protected]
Cc: Jens Axboe <[email protected]>,
	[email protected], [email protected]
Subject: [RFC 2/2] io_uring: reduce scheduling due to tw
Date: Fri, 10 Mar 2023 19:04:16 +0000	[thread overview]
Message-ID: <1001c8552cc79afc98ab778219e6ea3190ff37d9.1678474375.git.asml.silence@gmail.com> (raw)
In-Reply-To: <[email protected]>

Every task_work item tries to wake the task that will run it, which causes
excessive scheduling and the corresponding overhead. For some tw items that
is justified, but others do little more than post a single CQE.

When a task waits for multiple CQEs, every such task_work will wake it
up. Instead, the task may give a hint about how many CQEs it is waiting for;
io_req_local_work_add() will compare against it and skip the wake-up
if #CQEs + #tw items is not enough to satisfy the task. The optimisation
is used only for simple enough tws; more complex and/or urgent items
will force a wake-up. It's also limited to DEFER_TASKRUN.

The trade-off is extra atomics in io_req_local_work_add() in exchange for
saving more on rescheduling the task.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 41 +++++++++++++++++++++-------------
 io_uring/io_uring.h            |  1 +
 io_uring/notif.h               |  2 +-
 io_uring/rw.c                  |  2 +-
 5 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 00689c12f6ab..fdf0ae28023d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -295,7 +295,7 @@ struct io_ring_ctx {
 		spinlock_t		completion_lock;
 
 		bool			poll_multi_queue;
-		bool			cq_waiting;
+		atomic_t		cq_wait_nr;
 
 		/*
 		 * ->iopoll_list is protected by the ctx->uring_lock for
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 42ada470845f..0fa4dee8dcf4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1279,31 +1279,38 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx)
 	}
 }
 
-static void io_req_local_work_add(struct io_kiocb *req)
+static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	bool first;
 
 	percpu_ref_get(&ctx->refs);
 
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
-		goto put_ref;
-
+	first = llist_add(&req->io_task_work.node, &ctx->work_llist);
 	/* needed for the following wake up */
 	smp_mb__after_atomic();
 
-	if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
-		io_move_task_work_from_local(ctx);
-		goto put_ref;
+	if (first) {
+		if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
+			io_move_task_work_from_local(ctx);
+			goto put_ref;
+		}
+
+		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+			atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+		if (ctx->has_evfd)
+			io_eventfd_signal(ctx);
 	}
 
-	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
-		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-	if (ctx->has_evfd)
-		io_eventfd_signal(ctx);
+	if (atomic_read(&ctx->cq_wait_nr) <= 0)
+		goto put_ref;
 
-	if (READ_ONCE(ctx->cq_waiting))
-		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+	if (!(flags & IOU_F_TWQ_FACILE))
+		atomic_set(&ctx->cq_wait_nr, 0);
+	else if (atomic_dec_return(&ctx->cq_wait_nr) > 0)
+		goto put_ref;
 
+	wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
 put_ref:
 	percpu_ref_put(&ctx->refs);
 }
@@ -1315,7 +1322,7 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
 
 	if (!(flags & IOU_F_TWQ_FORCE_NORMAL) &&
 	    (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
-		io_req_local_work_add(req);
+		io_req_local_work_add(req, flags);
 		return;
 	}
 
@@ -2601,7 +2608,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		unsigned long check_cq;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-			WRITE_ONCE(ctx->cq_waiting, 1);
+			int to_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
+
+			atomic_set(&ctx->cq_wait_nr, to_wait);
 			set_current_state(TASK_INTERRUPTIBLE);
 		} else {
 			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
@@ -2610,7 +2619,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 		ret = io_cqring_wait_schedule(ctx, &iowq);
 		__set_current_state(TASK_RUNNING);
-		WRITE_ONCE(ctx->cq_waiting, 0);
+		atomic_set(&ctx->cq_wait_nr, 0);
 
 		if (ret < 0)
 			break;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index cd2e702f206c..98ff9b71d498 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -18,6 +18,7 @@
 enum {
 	/* don't use deferred task_work */
 	IOU_F_TWQ_FORCE_NORMAL			= 1,
+	IOU_F_TWQ_FACILE			= 2,
 };
 
 enum {
diff --git a/io_uring/notif.h b/io_uring/notif.h
index c88c800cd89d..ec9998fb0be6 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -33,7 +33,7 @@ static inline void io_notif_flush(struct io_kiocb *notif)
 
 	/* drop slot's master ref */
 	if (refcount_dec_and_test(&nd->uarg.refcnt))
-		io_req_task_work_add(notif);
+		__io_req_task_work_add(notif, IOU_F_TWQ_FACILE);
 }
 
 static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c233910e200..a4578c120973 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -304,7 +304,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
 		return;
 	io_req_set_res(req, io_fixup_rw_res(req, res), 0);
 	req->io_task_work.func = io_req_rw_complete;
-	io_req_task_work_add(req);
+	__io_req_task_work_add(req, IOU_F_TWQ_FACILE);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
-- 
2.39.1

next prev parent reply	other threads:[~2023-03-10 19:05 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-10 19:04 [RFC 0/2] optimise local-tw task resheduling Pavel Begunkov
2023-03-10 19:04 ` [RFC 1/2] io_uring: add tw add flags Pavel Begunkov
2023-03-10 19:04 ` Pavel Begunkov [this message]
2023-03-11 17:24 ` [RFC 0/2] optimise local-tw task resheduling Jens Axboe
2023-03-11 20:45   ` Pavel Begunkov
2023-03-11 20:53     ` Pavel Begunkov
2023-03-12 15:31       ` Jens Axboe
2023-03-13  3:52         ` Pavel Begunkov
2023-03-12 15:30     ` Jens Axboe
2023-03-13  3:45       ` Pavel Begunkov
2023-03-13 14:16         ` Jens Axboe
2023-03-13 17:50           ` Pavel Begunkov
2023-03-13 22:01             ` Jens Axboe
2023-03-16 12:25   ` Pavel Begunkov
2023-03-15  2:35 ` Ming Lei
2023-03-15 16:53   ` Pavel Begunkov
2023-03-16  1:25     ` Ming Lei

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:00689c12f6a dfblob:fdf0ae28023 dfblob:42ada470845
dfblob:0fa4dee8dcf dfblob:cd2e702f206 dfblob:98ff9b71d49
dfblob:c88c800cd89 dfblob:ec9998fb0be dfblob:4c233910e20
dfblob:a4578c12097 )
 OR (
bs:"[RFC 2/2] io_uring: reduce sheduling due to tw" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1001c8552cc79afc98ab778219e6ea3190ff37d9.1678474375.git.asml.silence@gmail.com \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox