public inbox for io-uring@vger.kernel.org
* [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU accounting
  2025-10-21 17:55 [PATCHSET 0/2] Fix crazy CPU accounting usage for SQPOLL Jens Axboe
@ 2025-10-21 17:55 ` Jens Axboe
  2025-10-21 23:35   ` Gabriel Krisman Bertazi
  0 siblings, 1 reply; 6+ messages in thread
From: Jens Axboe @ 2025-10-21 17:55 UTC (permalink / raw)
  To: io-uring; +Cc: changfengnan, xiaobing.li, lidiangang, Jens Axboe, stable

getrusage() does a lot more than what the SQPOLL accounting needs; the
latter only cares about (and uses) the stime. Rather than do a full
RUSAGE_SELF summation, just query the used stime instead.

Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/fdinfo.c |  9 +++++----
 io_uring/sqpoll.c | 34 ++++++++++++++++++++--------------
 io_uring/sqpoll.h |  1 +
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..966e06b078f6 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -59,7 +59,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 {
 	struct io_overflow_cqe *ocqe;
 	struct io_rings *r = ctx->rings;
-	struct rusage sq_usage;
 	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
 	unsigned int sq_head = READ_ONCE(r->sq.head);
 	unsigned int sq_tail = READ_ONCE(r->sq.tail);
@@ -152,14 +151,16 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 		 * thread termination.
 		 */
 		if (tsk) {
+			struct timespec64 ts;
+
 			get_task_struct(tsk);
 			rcu_read_unlock();
-			getrusage(tsk, RUSAGE_SELF, &sq_usage);
+			ts = io_sq_cpu_time(tsk);
 			put_task_struct(tsk);
 			sq_pid = sq->task_pid;
 			sq_cpu = sq->sq_cpu;
-			sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
-					 + sq_usage.ru_stime.tv_usec);
+			sq_total_time = (ts.tv_sec * 1000000
+					 + ts.tv_nsec / 1000);
 			sq_work_time = sq->work_time;
 		} else {
 			rcu_read_unlock();
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index a3f11349ce06..8705b0aa82e0 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -11,6 +11,7 @@
 #include <linux/audit.h>
 #include <linux/security.h>
 #include <linux/cpuset.h>
+#include <linux/sched/cputime.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
@@ -169,6 +170,22 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
 	return READ_ONCE(sqd->state);
 }
 
+struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
+{
+	u64 utime, stime;
+
+	task_cputime_adjusted(tsk, &utime, &stime);
+	return ns_to_timespec64(stime);
+}
+
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct timespec64 start)
+{
+	struct timespec64 ts;
+
+	ts = timespec64_sub(io_sq_cpu_time(current), start);
+	sqd->work_time += ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
+}
+
 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 {
 	unsigned int to_submit;
@@ -255,23 +272,12 @@ static bool io_sq_tw_pending(struct llist_node *retry_list)
 	return retry_list || !llist_empty(&tctx->task_list);
 }
 
-static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
-{
-	struct rusage end;
-
-	getrusage(current, RUSAGE_SELF, &end);
-	end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
-	end.ru_stime.tv_usec -= start->ru_stime.tv_usec;
-
-	sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
-}
-
 static int io_sq_thread(void *data)
 {
 	struct llist_node *retry_list = NULL;
 	struct io_sq_data *sqd = data;
 	struct io_ring_ctx *ctx;
-	struct rusage start;
+	struct timespec64 start;
 	unsigned long timeout = 0;
 	char buf[TASK_COMM_LEN] = {};
 	DEFINE_WAIT(wait);
@@ -317,7 +323,7 @@ static int io_sq_thread(void *data)
 		}
 
 		cap_entries = !list_is_singular(&sqd->ctx_list);
-		getrusage(current, RUSAGE_SELF, &start);
+		start = io_sq_cpu_time(current);
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
 			int ret = __io_sq_thread(ctx, cap_entries);
 
@@ -333,7 +339,7 @@ static int io_sq_thread(void *data)
 
 		if (sqt_spin || !time_after(jiffies, timeout)) {
 			if (sqt_spin) {
-				io_sq_update_worktime(sqd, &start);
+				io_sq_update_worktime(sqd, start);
 				timeout = jiffies + sqd->sq_thread_idle;
 			}
 			if (unlikely(need_resched())) {
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index b83dcdec9765..84ed2b312e88 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -29,6 +29,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd);
 void io_put_sq_data(struct io_sq_data *sqd);
 void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
 int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+struct timespec64 io_sq_cpu_time(struct task_struct *tsk);
 
 static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
 {
-- 
2.51.0



* Re: [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU accounting
  2025-10-21 17:55 ` [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU accounting Jens Axboe
@ 2025-10-21 23:35   ` Gabriel Krisman Bertazi
  2025-10-22 16:48     ` Jens Axboe
  0 siblings, 1 reply; 6+ messages in thread
From: Gabriel Krisman Bertazi @ 2025-10-21 23:35 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, changfengnan, xiaobing.li, lidiangang, stable

Jens Axboe <axboe@kernel.dk> writes:

> getrusage() does a lot more than what the SQPOLL accounting needs; the
> latter only cares about (and uses) the stime. Rather than do a full
> RUSAGE_SELF summation, just query the used stime instead.
>
> Cc: stable@vger.kernel.org
> Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  io_uring/fdinfo.c |  9 +++++----
>  io_uring/sqpoll.c | 34 ++++++++++++++++++++--------------
>  io_uring/sqpoll.h |  1 +
>  3 files changed, 26 insertions(+), 18 deletions(-)
>
> diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
> index ff3364531c77..966e06b078f6 100644
> --- a/io_uring/fdinfo.c
> +++ b/io_uring/fdinfo.c
> @@ -59,7 +59,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>  {
>  	struct io_overflow_cqe *ocqe;
>  	struct io_rings *r = ctx->rings;
> -	struct rusage sq_usage;
>  	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
>  	unsigned int sq_head = READ_ONCE(r->sq.head);
>  	unsigned int sq_tail = READ_ONCE(r->sq.tail);
> @@ -152,14 +151,16 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>  		 * thread termination.
>  		 */
>  		if (tsk) {
> +			struct timespec64 ts;
> +
>  			get_task_struct(tsk);
>  			rcu_read_unlock();
> -			getrusage(tsk, RUSAGE_SELF, &sq_usage);
> +			ts = io_sq_cpu_time(tsk);
>  			put_task_struct(tsk);
>  			sq_pid = sq->task_pid;
>  			sq_cpu = sq->sq_cpu;
> -			sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
> -					 + sq_usage.ru_stime.tv_usec);
> +			sq_total_time = (ts.tv_sec * 1000000
> +					 + ts.tv_nsec / 1000);
>  			sq_work_time = sq->work_time;
>  		} else {
>  			rcu_read_unlock();
> diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
> index a3f11349ce06..8705b0aa82e0 100644
> --- a/io_uring/sqpoll.c
> +++ b/io_uring/sqpoll.c
> @@ -11,6 +11,7 @@
>  #include <linux/audit.h>
>  #include <linux/security.h>
>  #include <linux/cpuset.h>
> +#include <linux/sched/cputime.h>
>  #include <linux/io_uring.h>
>  
>  #include <uapi/linux/io_uring.h>
> @@ -169,6 +170,22 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
>  	return READ_ONCE(sqd->state);
>  }
>  
> +struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
> +{
> +	u64 utime, stime;
> +
> +	task_cputime_adjusted(tsk, &utime, &stime);
> +	return ns_to_timespec64(stime);
> +}
> +
> +static void io_sq_update_worktime(struct io_sq_data *sqd, struct timespec64 start)
> +{
> +	struct timespec64 ts;
> +
> +	ts = timespec64_sub(io_sq_cpu_time(current), start);
> +	sqd->work_time += ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
> +}

Hi Jens,

Patch looks good. I'd just mention that you are converting ns to
timespec64, only to convert it back to usec when writing to
sqd->work_time and sq_total_time.  I think wraparound is not a concern
for task_cputime_adjusted, since this is the actual system cputime of a
single thread held in a u64 of nanoseconds (which only wraps after ~584
years).  So io_sq_cpu_time could just return usec directly, and
io_sq_update_worktime would be trivial:

  sqd->work_time += io_sq_cpu_time(current) - start;
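
Something like this minimal (untested) sketch, using do_div() for the
64-bit division:

	u64 io_sq_cpu_time(struct task_struct *tsk)
	{
		u64 utime, stime;

		/* stime is in nanoseconds, scale it down to microseconds */
		task_cputime_adjusted(tsk, &utime, &stime);
		do_div(stime, 1000);
		return stime;
	}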

Regardless:

Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>

Thanks,


-- 
Gabriel Krisman Bertazi


* Re: [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU accounting
  2025-10-21 23:35   ` Gabriel Krisman Bertazi
@ 2025-10-22 16:48     ` Jens Axboe
  0 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2025-10-22 16:48 UTC (permalink / raw)
  To: Gabriel Krisman Bertazi
  Cc: io-uring, changfengnan, xiaobing.li, lidiangang, stable

On 10/21/25 5:35 PM, Gabriel Krisman Bertazi wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> [...]
> 
> Hi Jens,
> 
> Patch looks good. I'd just mention that you are converting ns to
> timespec64, only to convert it back to usec when writing to
> sqd->work_time and sq_total_time.  I think wraparound is not a concern
> for task_cputime_adjusted, since this is the actual system cputime of a
> single thread held in a u64 of nanoseconds (which only wraps after ~584
> years).  So io_sq_cpu_time could just return usec directly, and
> io_sq_update_worktime would be trivial:
> 
>   sqd->work_time += io_sq_cpu_time(current) - start;

That's a good point - I'll update both patches, folding an incremental
like the below in. Thanks!
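
The core of the sqpoll.c side of that delta looks like the below, with
the callers and the fdinfo.c/sqpoll.h bits adjusted to match:

-struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
+u64 io_sq_cpu_usec(struct task_struct *tsk)
 {
 	u64 utime, stime;
 
 	task_cputime_adjusted(tsk, &utime, &stime);
-	return ns_to_timespec64(stime);
+	do_div(stime, 1000);
+	return stime;
 }
 
-static void io_sq_update_worktime(struct io_sq_data *sqd, struct timespec64 start)
-{
-	struct timespec64 ts;
-
-	ts = timespec64_sub(io_sq_cpu_time(current), start);
-	sqd->work_time += ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
+static void io_sq_update_worktime(struct io_sq_data *sqd, u64 usec)
+{
+	sqd->work_time += io_sq_cpu_usec(current) - usec;
 }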

-- 
Jens Axboe


* [PATCHSET v2 0/2] Fix crazy CPU accounting usage for SQPOLL
@ 2025-10-22 17:02 Jens Axboe
  2025-10-22 17:02 ` [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU accounting Jens Axboe
  2025-10-22 17:02 ` [PATCH 2/2] io_uring/sqpoll: be smarter on when to update the stime usage Jens Axboe
  0 siblings, 2 replies; 6+ messages in thread
From: Jens Axboe @ 2025-10-22 17:02 UTC (permalink / raw)
  To: io-uring; +Cc: changfengnan, xiaobing.li, lidiangang, krisman

Hi,

Fengnan Chang reports [1] excessive CPU usage from the SQPOLL work vs
total time accounting, and that indeed does look like a problem. The
issue seems to be overly eager time querying, using essentially the
wrong interface. Patch 1 gets rid of getrusage() for this, and patch 2
attempts to be a bit smarter about how often the time needs to get
queried. Profile results are included in patch 2 as well.

Since v1:
- Store time in usec rather than convert between nsec and timespec64.

 io_uring/fdinfo.c |  8 +++----
 io_uring/sqpoll.c | 65 ++++++++++++++++++++++++++++++++++++++-----------------
 io_uring/sqpoll.h |  1 +
 3 files changed, 50 insertions(+), 24 deletions(-)

-- 
Jens Axboe



* [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU accounting
  2025-10-22 17:02 [PATCHSET v2 0/2] Fix crazy CPU accounting usage for SQPOLL Jens Axboe
@ 2025-10-22 17:02 ` Jens Axboe
  2025-10-22 17:02 ` [PATCH 2/2] io_uring/sqpoll: be smarter on when to update the stime usage Jens Axboe
  1 sibling, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2025-10-22 17:02 UTC (permalink / raw)
  To: io-uring; +Cc: changfengnan, xiaobing.li, lidiangang, krisman, Jens Axboe,
	stable

getrusage() does a lot more than what the SQPOLL accounting needs; the
latter only cares about (and uses) the stime. Rather than do a full
RUSAGE_SELF summation, just query the used stime instead.

Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/fdinfo.c |  8 ++++----
 io_uring/sqpoll.c | 32 ++++++++++++++++++--------------
 io_uring/sqpoll.h |  1 +
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..294c75a8a3bd 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -59,7 +59,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 {
 	struct io_overflow_cqe *ocqe;
 	struct io_rings *r = ctx->rings;
-	struct rusage sq_usage;
 	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
 	unsigned int sq_head = READ_ONCE(r->sq.head);
 	unsigned int sq_tail = READ_ONCE(r->sq.tail);
@@ -152,14 +151,15 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 		 * thread termination.
 		 */
 		if (tsk) {
+			u64 usec;
+
 			get_task_struct(tsk);
 			rcu_read_unlock();
-			getrusage(tsk, RUSAGE_SELF, &sq_usage);
+			usec = io_sq_cpu_usec(tsk);
 			put_task_struct(tsk);
 			sq_pid = sq->task_pid;
 			sq_cpu = sq->sq_cpu;
-			sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
-					 + sq_usage.ru_stime.tv_usec);
+			sq_total_time = usec;
 			sq_work_time = sq->work_time;
 		} else {
 			rcu_read_unlock();
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index a3f11349ce06..2b816fdb9866 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -11,6 +11,7 @@
 #include <linux/audit.h>
 #include <linux/security.h>
 #include <linux/cpuset.h>
+#include <linux/sched/cputime.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
@@ -169,6 +170,20 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
 	return READ_ONCE(sqd->state);
 }
 
+u64 io_sq_cpu_usec(struct task_struct *tsk)
+{
+	u64 utime, stime;
+
+	task_cputime_adjusted(tsk, &utime, &stime);
+	do_div(stime, 1000);
+	return stime;
+}
+
+static void io_sq_update_worktime(struct io_sq_data *sqd, u64 usec)
+{
+	sqd->work_time += io_sq_cpu_usec(current) - usec;
+}
+
 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 {
 	unsigned int to_submit;
@@ -255,26 +270,15 @@ static bool io_sq_tw_pending(struct llist_node *retry_list)
 	return retry_list || !llist_empty(&tctx->task_list);
 }
 
-static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
-{
-	struct rusage end;
-
-	getrusage(current, RUSAGE_SELF, &end);
-	end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
-	end.ru_stime.tv_usec -= start->ru_stime.tv_usec;
-
-	sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
-}
-
 static int io_sq_thread(void *data)
 {
 	struct llist_node *retry_list = NULL;
 	struct io_sq_data *sqd = data;
 	struct io_ring_ctx *ctx;
-	struct rusage start;
 	unsigned long timeout = 0;
 	char buf[TASK_COMM_LEN] = {};
 	DEFINE_WAIT(wait);
+	u64 start;
 
 	/* offload context creation failed, just exit */
 	if (!current->io_uring) {
@@ -317,7 +321,7 @@ static int io_sq_thread(void *data)
 		}
 
 		cap_entries = !list_is_singular(&sqd->ctx_list);
-		getrusage(current, RUSAGE_SELF, &start);
+		start = io_sq_cpu_usec(current);
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
 			int ret = __io_sq_thread(ctx, cap_entries);
 
@@ -333,7 +337,7 @@ static int io_sq_thread(void *data)
 
 		if (sqt_spin || !time_after(jiffies, timeout)) {
 			if (sqt_spin) {
-				io_sq_update_worktime(sqd, &start);
+				io_sq_update_worktime(sqd, start);
 				timeout = jiffies + sqd->sq_thread_idle;
 			}
 			if (unlikely(need_resched())) {
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index b83dcdec9765..fd2f6f29b516 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -29,6 +29,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd);
 void io_put_sq_data(struct io_sq_data *sqd);
 void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
 int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+u64 io_sq_cpu_usec(struct task_struct *tsk);
 
 static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
 {
-- 
2.51.0



* [PATCH 2/2] io_uring/sqpoll: be smarter on when to update the stime usage
  2025-10-22 17:02 [PATCHSET v2 0/2] Fix crazy CPU accounting usage for SQPOLL Jens Axboe
  2025-10-22 17:02 ` [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU accounting Jens Axboe
@ 2025-10-22 17:02 ` Jens Axboe
  1 sibling, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2025-10-22 17:02 UTC (permalink / raw)
  To: io-uring; +Cc: changfengnan, xiaobing.li, lidiangang, krisman, Jens Axboe,
	stable

The current approach is a bit naive, and hence queries the time way too
often. Only start the "doing work" timer when there's actual work to
do, and then use that information to terminate (and account) the work
time once done. This greatly reduces the frequency of these calls, in
cases where the accounted time cannot have changed anyway.

Running a basic random reader that is setup to use SQPOLL, a profile
before this change shows these as the top cycle consumers:

+   32.60%  iou-sqp-1074  [kernel.kallsyms]  [k] thread_group_cputime_adjusted
+   19.97%  iou-sqp-1074  [kernel.kallsyms]  [k] thread_group_cputime
+   12.20%  io_uring      io_uring           [.] submitter_uring_fn
+    4.13%  iou-sqp-1074  [kernel.kallsyms]  [k] getrusage
+    2.45%  iou-sqp-1074  [kernel.kallsyms]  [k] io_submit_sqes
+    2.18%  iou-sqp-1074  [kernel.kallsyms]  [k] __pi_memset_generic
+    2.09%  iou-sqp-1074  [kernel.kallsyms]  [k] cputime_adjust

and after this change, top of profile looks as follows:

+   36.23%  io_uring     io_uring           [.] submitter_uring_fn
+   23.26%  iou-sqp-819  [kernel.kallsyms]  [k] io_sq_thread
+   10.14%  iou-sqp-819  [kernel.kallsyms]  [k] io_sq_tw
+    6.52%  iou-sqp-819  [kernel.kallsyms]  [k] tctx_task_work_run
+    4.82%  iou-sqp-819  [kernel.kallsyms]  [k] nvme_submit_cmds.part.0
+    2.91%  iou-sqp-819  [kernel.kallsyms]  [k] io_submit_sqes
[...]
     0.02%  iou-sqp-819  [kernel.kallsyms]  [k] cputime_adjust

where it's spending the cycles on things that actually matter.

Reported-by: Fengnan Chang <changfengnan@bytedance.com>
Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/sqpoll.c | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 2b816fdb9866..e22f072c7d5f 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -170,6 +170,11 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
 	return READ_ONCE(sqd->state);
 }
 
+struct io_sq_time {
+	bool started;
+	u64 usec;
+};
+
 u64 io_sq_cpu_usec(struct task_struct *tsk)
 {
 	u64 utime, stime;
@@ -179,12 +184,24 @@ u64 io_sq_cpu_usec(struct task_struct *tsk)
 	return stime;
 }
 
-static void io_sq_update_worktime(struct io_sq_data *sqd, u64 usec)
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct io_sq_time *ist)
+{
+	if (!ist->started)
+		return;
+	ist->started = false;
+	sqd->work_time += io_sq_cpu_usec(current) - ist->usec;
+}
+
+static void io_sq_start_worktime(struct io_sq_time *ist)
 {
-	sqd->work_time += io_sq_cpu_usec(current) - usec;
+	if (ist->started)
+		return;
+	ist->started = true;
+	ist->usec = io_sq_cpu_usec(current);
 }
 
-static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
+static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd,
+			  bool cap_entries, struct io_sq_time *ist)
 {
 	unsigned int to_submit;
 	int ret = 0;
@@ -197,6 +214,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 	if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
 		const struct cred *creds = NULL;
 
+		io_sq_start_worktime(ist);
+
 		if (ctx->sq_creds != current_cred())
 			creds = override_creds(ctx->sq_creds);
 
@@ -278,7 +297,6 @@ static int io_sq_thread(void *data)
 	unsigned long timeout = 0;
 	char buf[TASK_COMM_LEN] = {};
 	DEFINE_WAIT(wait);
-	u64 start;
 
 	/* offload context creation failed, just exit */
 	if (!current->io_uring) {
@@ -313,6 +331,7 @@ static int io_sq_thread(void *data)
 	mutex_lock(&sqd->lock);
 	while (1) {
 		bool cap_entries, sqt_spin = false;
+		struct io_sq_time ist = { };
 
 		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
 			if (io_sqd_handle_event(sqd))
@@ -321,9 +340,8 @@ static int io_sq_thread(void *data)
 		}
 
 		cap_entries = !list_is_singular(&sqd->ctx_list);
-		start = io_sq_cpu_usec(current);
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-			int ret = __io_sq_thread(ctx, cap_entries);
+			int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);
 
 			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
 				sqt_spin = true;
@@ -331,15 +349,18 @@ static int io_sq_thread(void *data)
 		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
 			sqt_spin = true;
 
-		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-			if (io_napi(ctx))
+		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+			if (io_napi(ctx)) {
+				io_sq_start_worktime(&ist);
 				io_napi_sqpoll_busy_poll(ctx);
+			}
+		}
+
+		io_sq_update_worktime(sqd, &ist);
 
 		if (sqt_spin || !time_after(jiffies, timeout)) {
-			if (sqt_spin) {
-				io_sq_update_worktime(sqd, start);
+			if (sqt_spin)
 				timeout = jiffies + sqd->sq_thread_idle;
-			}
 			if (unlikely(need_resched())) {
 				mutex_unlock(&sqd->lock);
 				cond_resched();
-- 
2.51.0

