From: Jens Axboe <[email protected]>
To: Pavel Begunkov <[email protected]>,
	David Wei <[email protected]>,
	[email protected]
Subject: Re: [PATCH v1 1/4] io_uring: only account cqring wait time as iowait if enabled for a ring
Date: Sun, 25 Feb 2024 14:11:10 -0700
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>

On 2/25/24 9:43 AM, Jens Axboe wrote:
> If you are motivated, please dig into it. If not, I guess I will take a
> look this week. 

The straightforward approach: add a nr_short_wait rq counter and an
->in_short_wait task flag, and make the idle governor factor those in.
Not sure how palatable it is. It would be nice to fold iowait under
this too, but that doesn't really work with how we pass back the
previous state.
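
For reference, the previous-state handoff in question is the
io_schedule_prepare()/io_schedule_finish() pair in kernel/sched/core.c,
which returns and later restores a single saved flag (roughly):

int io_schedule_prepare(void)
{
	int old_iowait = current->in_iowait;

	current->in_iowait = 1;
	blk_flush_plug(current->plug, true);
	return old_iowait;
}

void io_schedule_finish(int token)
{
	current->in_iowait = token;
}

Folding iowait under the short-wait count would need that token to carry
both ->in_iowait and ->in_short_wait, which is why the below keeps them
separate.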

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b96e3da0fedd..39f05d6062e1 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -119,7 +119,7 @@ struct menu_device {
 	int		interval_ptr;
 };
 
-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
+static inline int which_bucket(u64 duration_ns, unsigned int nr_short_waiters)
 {
 	int bucket = 0;
 
@@ -129,7 +129,7 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
 	 * This allows us to calculate
 	 * E(duration)|iowait
 	 */
-	if (nr_iowaiters)
+	if (nr_short_waiters)
 		bucket = BUCKETS/2;
 
 	if (duration_ns < 10ULL * NSEC_PER_USEC)
@@ -152,10 +152,10 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
  * to be, the higher this multiplier, and thus the higher
  * the barrier to go to an expensive C state.
  */
-static inline int performance_multiplier(unsigned int nr_iowaiters)
+static inline int performance_multiplier(unsigned int nr_short_waiters)
 {
 	/* for IO wait tasks (per cpu!) we add 10x each */
-	return 1 + 10 * nr_iowaiters;
+	return 1 + 10 * nr_short_waiters;
 }
 
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
@@ -266,7 +266,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
 	u64 predicted_ns;
 	u64 interactivity_req;
-	unsigned int nr_iowaiters;
+	unsigned int nr_short_waiters;
 	ktime_t delta, delta_tick;
 	int i, idx;
 
@@ -275,7 +275,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		data->needs_update = 0;
 	}
 
-	nr_iowaiters = nr_iowait_cpu(dev->cpu);
+	nr_short_waiters = nr_iowait_cpu(dev->cpu) + nr_short_wait_cpu(dev->cpu);
 
 	/* Find the shortest expected idle interval. */
 	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
@@ -290,7 +290,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		}
 
 		data->next_timer_ns = delta;
-		data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+		data->bucket = which_bucket(data->next_timer_ns, nr_short_waiters);
 
 		/* Round up the result for half microseconds. */
 		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
@@ -308,7 +308,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 */
 		data->next_timer_ns = KTIME_MAX;
 		delta_tick = TICK_NSEC / 2;
-		data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+		data->bucket = which_bucket(KTIME_MAX, nr_short_waiters);
 	}
 
 	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
@@ -341,7 +341,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 * latency_req to determine the maximum exit latency.
 		 */
 		interactivity_req = div64_u64(predicted_ns,
-					      performance_multiplier(nr_iowaiters));
+					      performance_multiplier(nr_short_waiters));
 		if (latency_req > interactivity_req)
 			latency_req = interactivity_req;
 	}
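
To make the governor impact concrete, a quick walk-through of the math
above (illustration only, not part of the patch):

/* two tasks in a short wait on this CPU */
unsigned int nr_short_waiters = 2;
u64 predicted_ns = 100 * NSEC_PER_USEC;	/* 100 usec predicted idle */
u64 interactivity_req;

/* performance_multiplier() returns 1 + 10 * 2 = 21 */
interactivity_req = div64_u64(predicted_ns,
			      performance_multiplier(nr_short_waiters));

/* interactivity_req == 4761 ns, so latency_req is capped at ~4.8 usec
 * and C-states with a longer exit latency are ruled out */
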
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..b04c08cadf1f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -923,6 +923,7 @@ struct task_struct {
 	/* Bit to tell TOMOYO we're in execve(): */
 	unsigned			in_execve:1;
 	unsigned			in_iowait:1;
+	unsigned			in_short_wait:1;
 #ifndef TIF_RESTORE_SIGMASK
 	unsigned			restore_sigmask:1;
 #endif
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 0108a38bb64d..12f5795c4c32 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -21,6 +21,7 @@ extern unsigned int nr_running(void);
 extern bool single_task_running(void);
 extern unsigned int nr_iowait(void);
 extern unsigned int nr_iowait_cpu(int cpu);
+extern unsigned int nr_short_wait_cpu(int cpu);
 
 static inline int sched_info_on(void)
 {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f6332fc56bed..024af44ead12 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2519,7 +2519,7 @@ static bool current_pending_io(void)
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq)
 {
-	int io_wait, ret;
+	int short_wait, ret;
 
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
@@ -2537,15 +2537,15 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 * can take into account that the task is waiting for IO - turns out
 	 * to be important for low QD IO.
 	 */
-	io_wait = current->in_iowait;
+	short_wait = current->in_short_wait;
 	if (current_pending_io())
-		current->in_iowait = 1;
+		current->in_short_wait = 1;
 	ret = 0;
 	if (iowq->timeout == KTIME_MAX)
 		schedule();
 	else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
 		ret = -ETIME;
-	current->in_iowait = io_wait;
+	current->in_short_wait = short_wait;
 	return ret;
 }
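
Net effect of the io_uring hunk: the cqring wait path no longer sets
->in_iowait at all, so these waits disappear from iowait accounting
(nr_iowait, /proc/stat) instead of being opt-in per ring, while the new
flag still feeds the same cpufreq/cpuidle hints. Boiled down (sketch
only, the real path keeps the current_pending_io() gate and the timeout
handling):

/* before: cpufreq/cpuidle hint, but also reported as iowait */
current->in_iowait = 1;
schedule();
current->in_iowait = 0;

/* after: same hint via nr_short_wait, invisible to iowait stats */
current->in_short_wait = 1;
schedule();
current->in_short_wait = 0;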
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..dc7a08db8921 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3790,6 +3790,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 	if (p->in_iowait) {
 		delayacct_blkio_end(p);
 		atomic_dec(&task_rq(p)->nr_iowait);
+	} else if (p->in_short_wait) {
+		atomic_dec(&task_rq(p)->nr_short_wait);
 	}
 
 	activate_task(rq, p, en_flags);
@@ -4356,6 +4358,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 			if (p->in_iowait) {
 				delayacct_blkio_end(p);
 				atomic_dec(&task_rq(p)->nr_iowait);
+			} else if (p->in_short_wait) {
+				atomic_dec(&task_rq(p)->nr_short_wait);
 			}
 
 			wake_flags |= WF_MIGRATED;
@@ -5506,6 +5510,11 @@ unsigned int nr_iowait(void)
 	return sum;
 }
 
+unsigned int nr_short_wait_cpu(int cpu)
+{
+	return atomic_read(&cpu_rq(cpu)->nr_short_wait);
+}
+
 #ifdef CONFIG_SMP
 
 /*
@@ -6683,6 +6692,8 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 			if (prev->in_iowait) {
 				atomic_inc(&rq->nr_iowait);
 				delayacct_blkio_start();
+			} else if (prev->in_short_wait) {
+				atomic_inc(&rq->nr_short_wait);
 			}
 		}
 		switch_count = &prev->nvcsw;
@@ -10030,6 +10041,7 @@ void __init sched_init(void)
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
+		atomic_set(&rq->nr_short_wait, 0);
 
 #ifdef CONFIG_SCHED_CORE
 		rq->core = rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 533547e3c90a..9d4fc0b9de26 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6721,7 +6721,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
 	 * passed.
 	 */
-	if (p->in_iowait)
+	if (p->in_iowait || p->in_short_wait)
 		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
 
 	for_each_sched_entity(se) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..0530e9e97ecb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1050,6 +1050,7 @@ struct rq {
 #endif
 
 	atomic_t		nr_iowait;
+	atomic_t		nr_short_wait;
 
 #ifdef CONFIG_SCHED_DEBUG
 	u64 last_seen_need_resched_ns;

-- 
Jens Axboe

