* [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection
@ 2025-10-20 11:30 Fengnan Chang
2025-10-20 14:59 ` Gabriel Krisman Bertazi
2025-10-20 15:12 ` Jens Axboe
0 siblings, 2 replies; 6+ messages in thread
From: Fengnan Chang @ 2025-10-20 11:30 UTC (permalink / raw)
To: axboe, xiaobing.li, asml.silence, io-uring; +Cc: Fengnan Chang, Diangang Li
In previous versions, getrusage was always called in sqthread
to count work time, but this could incur some overhead.
This patch turns off stats by default, and introduces a new flag
IORING_SETUP_SQTHREAD_STATS that allows user to enable the
collection of statistics in the sqthread.
./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 ./testfile
IOPS base: 570K, patch: 590K
./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 /dev/nvme1n1
IOPS base: 826K, patch: 889K
Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
Reviewed-by: Diangang Li <lidiangang@bytedance.com>
---
include/uapi/linux/io_uring.h | 5 +++++
io_uring/fdinfo.c | 15 ++++++++++-----
io_uring/io_uring.h | 3 ++-
io_uring/sqpoll.c | 10 +++++++---
io_uring/sqpoll.h | 1 +
5 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 263bed13473e..8c5cb9533950 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,11 @@ enum io_uring_sqe_flags_bit {
*/
#define IORING_SETUP_CQE_MIXED (1U << 18)
+/*
+ * Enable SQPOLL thread stats collection
+ */
+#define IORING_SETUP_SQTHREAD_STATS (1U << 19)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..4c532e414255 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -154,13 +154,16 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
if (tsk) {
get_task_struct(tsk);
rcu_read_unlock();
- getrusage(tsk, RUSAGE_SELF, &sq_usage);
+ if (sq->enable_work_time_stat)
+ getrusage(tsk, RUSAGE_SELF, &sq_usage);
put_task_struct(tsk);
sq_pid = sq->task_pid;
sq_cpu = sq->sq_cpu;
- sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
+ if (sq->enable_work_time_stat) {
+ sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
+ sq_usage.ru_stime.tv_usec);
- sq_work_time = sq->work_time;
+ sq_work_time = sq->work_time;
+ }
} else {
rcu_read_unlock();
}
@@ -168,8 +171,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "SqThread:\t%d\n", sq_pid);
seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
- seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
- seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
+ if (ctx->flags & IORING_SETUP_SQTHREAD_STATS) {
+ seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
+ seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
+ }
seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
for (i = 0; i < ctx->file_table.data.nr; i++) {
struct file *f = NULL;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 46d9141d772a..949dc7cba111 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -54,7 +54,8 @@
IORING_SETUP_REGISTERED_FD_ONLY |\
IORING_SETUP_NO_SQARRAY |\
IORING_SETUP_HYBRID_IOPOLL |\
- IORING_SETUP_CQE_MIXED)
+ IORING_SETUP_CQE_MIXED |\
+ IORING_SETUP_SQTHREAD_STATS)
#define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
IORING_ENTER_SQ_WAKEUP |\
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index a3f11349ce06..46bcd4854abc 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -161,6 +161,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
mutex_init(&sqd->lock);
init_waitqueue_head(&sqd->wait);
init_completion(&sqd->exited);
+ sqd->enable_work_time_stat = false;
return sqd;
}
@@ -317,7 +318,8 @@ static int io_sq_thread(void *data)
}
cap_entries = !list_is_singular(&sqd->ctx_list);
- getrusage(current, RUSAGE_SELF, &start);
+ if (sqd->enable_work_time_stat)
+ getrusage(current, RUSAGE_SELF, &start);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
@@ -333,7 +335,8 @@ static int io_sq_thread(void *data)
if (sqt_spin || !time_after(jiffies, timeout)) {
if (sqt_spin) {
- io_sq_update_worktime(sqd, &start);
+ if (sqd->enable_work_time_stat)
+ io_sq_update_worktime(sqd, &start);
timeout = jiffies + sqd->sq_thread_idle;
}
if (unlikely(need_resched())) {
@@ -445,7 +448,8 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
ret = PTR_ERR(sqd);
goto err;
}
-
+ if (ctx->flags & IORING_SETUP_SQTHREAD_STATS)
+ sqd->enable_work_time_stat = true;
ctx->sq_creds = get_current_cred();
ctx->sq_data = sqd;
ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index b83dcdec9765..55f2e4d46d54 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -19,6 +19,7 @@ struct io_sq_data {
u64 work_time;
unsigned long state;
struct completion exited;
+ bool enable_work_time_stat;
};
int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p);
--
2.20.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection
2025-10-20 11:30 [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection Fengnan Chang
@ 2025-10-20 14:59 ` Gabriel Krisman Bertazi
2025-10-21 8:50 ` [External] " Fengnan Chang
2025-10-20 15:12 ` Jens Axboe
1 sibling, 1 reply; 6+ messages in thread
From: Gabriel Krisman Bertazi @ 2025-10-20 14:59 UTC (permalink / raw)
To: Fengnan Chang; +Cc: axboe, xiaobing.li, asml.silence, io-uring, Diangang Li
Fengnan Chang <changfengnan@bytedance.com> writes:
> In previous versions, getrusage was always called in sqrthread
> to count work time, but this could incur some overhead.
> This patch turn off stats by default, and introduces a new flag
> IORING_SETUP_SQTHREAD_STATS that allows user to enable the
> collection of statistics in the sqthread.
>
> ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 ./testfile
> IOPS base: 570K, patch: 590K
>
> ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 /dev/nvme1n1
> IOPS base: 826K, patch: 889K
>
> Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
> Reviewed-by: Diangang Li <lidiangang@bytedance.com>
> ---
> include/uapi/linux/io_uring.h | 5 +++++
> io_uring/fdinfo.c | 15 ++++++++++-----
> io_uring/io_uring.h | 3 ++-
> io_uring/sqpoll.c | 10 +++++++---
> io_uring/sqpoll.h | 1 +
> 5 files changed, 25 insertions(+), 9 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 263bed13473e..8c5cb9533950 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -231,6 +231,11 @@ enum io_uring_sqe_flags_bit {
> */
> #define IORING_SETUP_CQE_MIXED (1U << 18)
>
> +/*
> + * Enable SQPOLL thread stats collection
> + */
> +#define IORING_SETUP_SQTHREAD_STATS (1U << 19)
> +
> enum io_uring_op {
> IORING_OP_NOP,
> IORING_OP_READV,
> diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
> index ff3364531c77..4c532e414255 100644
> --- a/io_uring/fdinfo.c
> +++ b/io_uring/fdinfo.c
> @@ -154,13 +154,16 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> if (tsk) {
> get_task_struct(tsk);
> rcu_read_unlock();
> - getrusage(tsk, RUSAGE_SELF, &sq_usage);
> + if (sq->enable_work_time_stat)
> + getrusage(tsk, RUSAGE_SELF, &sq_usage);
> put_task_struct(tsk);
If the usage statistics are disabled, you don't need to acquire and drop
the task_struct reference any longer. you can move the get/put_task_struct
into the if.
> sq_pid = sq->task_pid;
> sq_cpu = sq->sq_cpu;
> - sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
> + if (sq->enable_work_time_stat) {
> + sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
> + sq_usage.ru_stime.tv_usec);
> - sq_work_time = sq->work_time;
> + sq_work_time = sq->work_time;
> + }
> } else {
> rcu_read_unlock();
> }
> @@ -168,8 +171,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>
> seq_printf(m, "SqThread:\t%d\n", sq_pid);
> seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
> - seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
> - seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
> + if (ctx->flags & IORING_SETUP_SQTHREAD_STATS) {
> + seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
> + seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
> + }
It works, but it is weird that you gate the writing of sq_total_time on
(sq->enable_work_time_stat) and then, the display of it on (ctx->flags &
IORING_SETUP_SQTHREAD_STATS). Since a sqpoll can attend to more than
one ctx, I'd just check ctx->flags & IORING_SETUP_SQTHREAD_STATS
in both places in this function.
> seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
> for (i = 0; i < ctx->file_table.data.nr; i++) {
> struct file *f = NULL;
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index 46d9141d772a..949dc7cba111 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -54,7 +54,8 @@
> IORING_SETUP_REGISTERED_FD_ONLY |\
> IORING_SETUP_NO_SQARRAY |\
> IORING_SETUP_HYBRID_IOPOLL |\
> - IORING_SETUP_CQE_MIXED)
> + IORING_SETUP_CQE_MIXED |\
> + IORING_SETUP_SQTHREAD_STATS)
>
> #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
> IORING_ENTER_SQ_WAKEUP |\
> diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
> index a3f11349ce06..46bcd4854abc 100644
> --- a/io_uring/sqpoll.c
> +++ b/io_uring/sqpoll.c
> @@ -161,6 +161,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
> mutex_init(&sqd->lock);
> init_waitqueue_head(&sqd->wait);
> init_completion(&sqd->exited);
> + sqd->enable_work_time_stat = false;
> return sqd;
> }
>
> @@ -317,7 +318,8 @@ static int io_sq_thread(void *data)
> }
>
> cap_entries = !list_is_singular(&sqd->ctx_list);
> - getrusage(current, RUSAGE_SELF, &start);
> + if (sqd->enable_work_time_stat)
> + getrusage(current, RUSAGE_SELF, &start);
> list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
> int ret = __io_sq_thread(ctx, cap_entries);
>
> @@ -333,7 +335,8 @@ static int io_sq_thread(void *data)
>
> if (sqt_spin || !time_after(jiffies, timeout)) {
> if (sqt_spin) {
> - io_sq_update_worktime(sqd, &start);
> + if (sqd->enable_work_time_stat)
> + io_sq_update_worktime(sqd, &start);
> timeout = jiffies + sqd->sq_thread_idle;
> }
> if (unlikely(need_resched())) {
> @@ -445,7 +448,8 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
> ret = PTR_ERR(sqd);
> goto err;
> }
> -
> + if (ctx->flags & IORING_SETUP_SQTHREAD_STATS)
> + sqd->enable_work_time_stat = true;
> ctx->sq_creds = get_current_cred();
> ctx->sq_data = sqd;
> ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
> diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
> index b83dcdec9765..55f2e4d46d54 100644
> --- a/io_uring/sqpoll.h
> +++ b/io_uring/sqpoll.h
> @@ -19,6 +19,7 @@ struct io_sq_data {
> u64 work_time;
> unsigned long state;
> struct completion exited;
> + bool enable_work_time_stat;
> };
>
> int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p);
--
Gabriel Krisman Bertazi
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection
2025-10-20 11:30 [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection Fengnan Chang
2025-10-20 14:59 ` Gabriel Krisman Bertazi
@ 2025-10-20 15:12 ` Jens Axboe
2025-10-21 8:54 ` [External] " Fengnan Chang
1 sibling, 1 reply; 6+ messages in thread
From: Jens Axboe @ 2025-10-20 15:12 UTC (permalink / raw)
To: Fengnan Chang, xiaobing.li, asml.silence, io-uring; +Cc: Diangang Li
On 10/20/25 5:30 AM, Fengnan Chang wrote:
> In previous versions, getrusage was always called in sqrthread
> to count work time, but this could incur some overhead.
> This patch turn off stats by default, and introduces a new flag
> IORING_SETUP_SQTHREAD_STATS that allows user to enable the
> collection of statistics in the sqthread.
>
> ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 ./testfile
> IOPS base: 570K, patch: 590K
>
> ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 /dev/nvme1n1
> IOPS base: 826K, patch: 889K
That's a crazy amount of overhead indeed. I'm assuming this is
because the task has lots of threads? And/or going through the retry
a lot? Might make more sense to have a cheaper and more rough
getrusage() instead? All we really use is ru_stime.{tv_sec,tv_nsec}.
Should it be using RUSAGE_THREAD? Correct me if I'm wrong, but using
PTHREAD_SELF actually seems wrong as-is.
In any case, I don't think a setup flag is the right choice here. That
space is fairly limited, and SQPOLL is also a bit of an esoteric
feature. Hence not sure a setup flag is the right approach. Would
probably make more sense to have a REGISTER opcode to get/set various
features like this, I'm sure it's not the last thing like this we'll run
into. But as mentioned, I do think just having a more light weight
getrusage would perhaps be prudent.
--
Jens Axboe
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [External] Re: [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection
2025-10-20 14:59 ` Gabriel Krisman Bertazi
@ 2025-10-21 8:50 ` Fengnan Chang
0 siblings, 0 replies; 6+ messages in thread
From: Fengnan Chang @ 2025-10-21 8:50 UTC (permalink / raw)
To: Gabriel Krisman Bertazi
Cc: axboe, xiaobing.li, asml.silence, io-uring, Diangang Li
Gabriel Krisman Bertazi <krisman@suse.de> 于2025年10月20日周一 22:59写道:
>
> Fengnan Chang <changfengnan@bytedance.com> writes:
>
> > In previous versions, getrusage was always called in sqrthread
> > to count work time, but this could incur some overhead.
> > This patch turn off stats by default, and introduces a new flag
> > IORING_SETUP_SQTHREAD_STATS that allows user to enable the
> > collection of statistics in the sqthread.
> >
> > ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 ./testfile
> > IOPS base: 570K, patch: 590K
> >
> > ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 /dev/nvme1n1
> > IOPS base: 826K, patch: 889K
> >
> > Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
> > Reviewed-by: Diangang Li <lidiangang@bytedance.com>
> > ---
> > include/uapi/linux/io_uring.h | 5 +++++
> > io_uring/fdinfo.c | 15 ++++++++++-----
> > io_uring/io_uring.h | 3 ++-
> > io_uring/sqpoll.c | 10 +++++++---
> > io_uring/sqpoll.h | 1 +
> > 5 files changed, 25 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> > index 263bed13473e..8c5cb9533950 100644
> > --- a/include/uapi/linux/io_uring.h
> > +++ b/include/uapi/linux/io_uring.h
> > @@ -231,6 +231,11 @@ enum io_uring_sqe_flags_bit {
> > */
> > #define IORING_SETUP_CQE_MIXED (1U << 18)
> >
> > +/*
> > + * Enable SQPOLL thread stats collection
> > + */
> > +#define IORING_SETUP_SQTHREAD_STATS (1U << 19)
> > +
> > enum io_uring_op {
> > IORING_OP_NOP,
> > IORING_OP_READV,
> > diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
> > index ff3364531c77..4c532e414255 100644
> > --- a/io_uring/fdinfo.c
> > +++ b/io_uring/fdinfo.c
> > @@ -154,13 +154,16 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> > if (tsk) {
> > get_task_struct(tsk);
> > rcu_read_unlock();
> > - getrusage(tsk, RUSAGE_SELF, &sq_usage);
> > + if (sq->enable_work_time_stat)
> > + getrusage(tsk, RUSAGE_SELF, &sq_usage);
> > put_task_struct(tsk);
>
> If the usage statistics are disabled, you don't need to acquire and drop
> the task_struct reference any longer. you can move the get/put_task_struct
> into the if.
I'll fix this in next version.
>
> > sq_pid = sq->task_pid;
> > sq_cpu = sq->sq_cpu;
> > - sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
> > + if (sq->enable_work_time_stat) {
> > + sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
> > + sq_usage.ru_stime.tv_usec);
> > - sq_work_time = sq->work_time;
> > + sq_work_time = sq->work_time;
> > + }
> > } else {
> > rcu_read_unlock();
> > }
> > @@ -168,8 +171,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
> >
> > seq_printf(m, "SqThread:\t%d\n", sq_pid);
> > seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
> > - seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
> > - seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
> > + if (ctx->flags & IORING_SETUP_SQTHREAD_STATS) {
> > + seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
> > + seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
> > + }
>
> It works, but it is weird that you gate the writing of sq_total_time on
> (sq->enable_work_time_stat) and then, the display of it on (ctx->flags &
> IORING_SETUP_SQTHREAD_STATS). Since a sqpoll can attend to more than
> one ctx, I'd just check ctx->flags & IORING_SETUP_SQTHREAD_STATS
> in both places in this function.
Yeh, there's a bit of a problem here, I was going to use check
ctx->flags & IORING_SETUP_SQTHREAD_STATS, but in io_sq_thread, getrusage
is try to account multi ctx, and can't get ctx before getrusage, so I use
enable_work_time_stat, and I missed something. I would like to use
enable_work_time_stat in both places in this function.
>
> > seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
> > for (i = 0; i < ctx->file_table.data.nr; i++) {
> > struct file *f = NULL;
> > diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> > index 46d9141d772a..949dc7cba111 100644
> > --- a/io_uring/io_uring.h
> > +++ b/io_uring/io_uring.h
> > @@ -54,7 +54,8 @@
> > IORING_SETUP_REGISTERED_FD_ONLY |\
> > IORING_SETUP_NO_SQARRAY |\
> > IORING_SETUP_HYBRID_IOPOLL |\
> > - IORING_SETUP_CQE_MIXED)
> > + IORING_SETUP_CQE_MIXED |\
> > + IORING_SETUP_SQTHREAD_STATS)
> >
> > #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
> > IORING_ENTER_SQ_WAKEUP |\
> > diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
> > index a3f11349ce06..46bcd4854abc 100644
> > --- a/io_uring/sqpoll.c
> > +++ b/io_uring/sqpoll.c
> > @@ -161,6 +161,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
> > mutex_init(&sqd->lock);
> > init_waitqueue_head(&sqd->wait);
> > init_completion(&sqd->exited);
> > + sqd->enable_work_time_stat = false;
> > return sqd;
> > }
> >
> > @@ -317,7 +318,8 @@ static int io_sq_thread(void *data)
> > }
> >
> > cap_entries = !list_is_singular(&sqd->ctx_list);
> > - getrusage(current, RUSAGE_SELF, &start);
> > + if (sqd->enable_work_time_stat)
> > + getrusage(current, RUSAGE_SELF, &start);
> > list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
> > int ret = __io_sq_thread(ctx, cap_entries);
> >
> > @@ -333,7 +335,8 @@ static int io_sq_thread(void *data)
> >
> > if (sqt_spin || !time_after(jiffies, timeout)) {
> > if (sqt_spin) {
> > - io_sq_update_worktime(sqd, &start);
> > + if (sqd->enable_work_time_stat)
> > + io_sq_update_worktime(sqd, &start);
> > timeout = jiffies + sqd->sq_thread_idle;
> > }
> > if (unlikely(need_resched())) {
> > @@ -445,7 +448,8 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
> > ret = PTR_ERR(sqd);
> > goto err;
> > }
> > -
> > + if (ctx->flags & IORING_SETUP_SQTHREAD_STATS)
> > + sqd->enable_work_time_stat = true;
> > ctx->sq_creds = get_current_cred();
> > ctx->sq_data = sqd;
> > ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
> > diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
> > index b83dcdec9765..55f2e4d46d54 100644
> > --- a/io_uring/sqpoll.h
> > +++ b/io_uring/sqpoll.h
> > @@ -19,6 +19,7 @@ struct io_sq_data {
> > u64 work_time;
> > unsigned long state;
> > struct completion exited;
> > + bool enable_work_time_stat;
> > };
> >
> > int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p);
>
> --
> Gabriel Krisman Bertazi
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [External] Re: [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection
2025-10-20 15:12 ` Jens Axboe
@ 2025-10-21 8:54 ` Fengnan Chang
2025-10-21 17:54 ` Jens Axboe
0 siblings, 1 reply; 6+ messages in thread
From: Fengnan Chang @ 2025-10-21 8:54 UTC (permalink / raw)
To: Jens Axboe; +Cc: xiaobing.li, asml.silence, io-uring, Diangang Li
Jens Axboe <axboe@kernel.dk> 于2025年10月20日周一 23:12写道:
>
> On 10/20/25 5:30 AM, Fengnan Chang wrote:
> > In previous versions, getrusage was always called in sqrthread
> > to count work time, but this could incur some overhead.
> > This patch turn off stats by default, and introduces a new flag
> > IORING_SETUP_SQTHREAD_STATS that allows user to enable the
> > collection of statistics in the sqthread.
> >
> > ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 ./testfile
> > IOPS base: 570K, patch: 590K
> >
> > ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 /dev/nvme1n1
> > IOPS base: 826K, patch: 889K
>
> That's a crazy amount of overhead indeed. I'm assuming this is
> because the task has lots of threads? And/or going through the retry
> a lot? Might make more sense to have a cheaper and more rough
> getrusage() instead? All we really use is ru_stime.{tv_sec,tv_nsec}.
> Should it be using RUSAGE_THREAD? Correct me if I'm wrong, but using
> PTHREAD_SELF actually seems wrong as-is.
Only one sqpoll thread, no retry. Only enable sq_thread_poll by default in
./t/io_uring.c to test.
Yeh, getrusage is not correct, I'll try to find a cheaper way.
>
> In any case, I don't think a setup flag is the right choice here. That
> space is fairly limited, and SQPOLL is also a bit of an esoteric
> feature. Hence not sure a setup flag is the right approach. Would
> probably make more sense to have a REGISTER opcode to get/set various
> features like this, I'm sure it's not the last thing like this we'll run
> into. But as mentioned, I do think just having a more light weight
> getrusage would perhaps be prudent.
Get your point, I'll make it in REGISTER opcode.
>
> --
> Jens Axboe
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [External] Re: [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection
2025-10-21 8:54 ` [External] " Fengnan Chang
@ 2025-10-21 17:54 ` Jens Axboe
0 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2025-10-21 17:54 UTC (permalink / raw)
To: Fengnan Chang; +Cc: xiaobing.li, asml.silence, io-uring, Diangang Li
[-- Attachment #1: Type: text/plain, Size: 2275 bytes --]
On 10/21/25 2:54 AM, Fengnan Chang wrote:
Jens Axboe <axboe@kernel.dk> 于2025年10月20日周一 23:12写道:
>>
>> On 10/20/25 5:30 AM, Fengnan Chang wrote:
>>> In previous versions, getrusage was always called in sqrthread
>>> to count work time, but this could incur some overhead.
>>> This patch turn off stats by default, and introduces a new flag
>>> IORING_SETUP_SQTHREAD_STATS that allows user to enable the
>>> collection of statistics in the sqthread.
>>>
>>> ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 ./testfile
>>> IOPS base: 570K, patch: 590K
>>>
>>> ./t/io_uring -p1 -d128 -b4096 -s32 -c1 -F1 -B1 -R1 -X1 -n1 /dev/nvme1n1
>>> IOPS base: 826K, patch: 889K
>>
>> That's a crazy amount of overhead indeed. I'm assuming this is
>> because the task has lots of threads? And/or going through the retry
>> a lot? Might make more sense to have a cheaper and more rough
>> getrusage() instead? All we really use is ru_stime.{tv_sec,tv_nsec}.
>> Should it be using RUSAGE_THREAD? Correct me if I'm wrong, but using
>> PTHREAD_SELF actually seems wrong as-is.
>
> Only one sqpoll thread, no retry. Only enable sq_thread_poll by default in
> ./t/io_uring.c to test.
> Yeh, getrusage is not correct, I'll try to find a cheaper way.
>
>>
>> In any case, I don't think a setup flag is the right choice here. That
>> space is fairly limited, and SQPOLL is also a bit of an esoteric
>> feature. Hence not sure a setup flag is the right approach. Would
>> probably make more sense to have a REGISTER opcode to get/set various
>> features like this, I'm sure it's not the last thing like this we'll run
>> into. But as mentioned, I do think just having a more light weight
>> getrusage would perhaps be prudent.
> Get your point, I'll make it in REGISTER opcode.
My main point was actually "perhaps we're doing something stupid, maybe
let's try and do the accounting in a less stupid way". For example,
right now it's just blindly getting the time all of the time. That seems
very stupid. And rather than add a knob for this, perhaps we can just
fix it?
I took a quick stab at it, see the attached two patches. I'll send them
out as well. With those, accounting should still be fine, but all the
overhead is essentially gone. Hence no need for a knob...
--
Jens Axboe
[-- Attachment #2: 0002-io_uring-sqpoll-be-smarter-on-when-to-update-the-sti.patch --]
[-- Type: text/x-patch, Size: 5393 bytes --]
From ad005f741645fc7da5d16ac0768419bb3393e22b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 21 Oct 2025 11:44:39 -0600
Subject: [PATCH 2/2] io_uring/sqpoll: be smarter on when to update the stime
usage
The current approach is a bit naive, and hence calls the time querying
way too often. Only start the "doing work" timer when there's actual
work to do, and then use that information to terminate (and account) the
work time once done. This greatly reduces the frequency of these calls,
when they cannot have changed anyway.
Running a basic random reader that is setup to use SQPOLL, a profile
before this change shows these as the top cycle consumers:
+ 32.60% iou-sqp-1074 [kernel.kallsyms] [k] thread_group_cputime_adjusted
+ 19.97% iou-sqp-1074 [kernel.kallsyms] [k] thread_group_cputime
+ 12.20% io_uring io_uring [.] submitter_uring_fn
+ 4.13% iou-sqp-1074 [kernel.kallsyms] [k] getrusage
+ 2.45% iou-sqp-1074 [kernel.kallsyms] [k] io_submit_sqes
+ 2.18% iou-sqp-1074 [kernel.kallsyms] [k] __pi_memset_generic
+ 2.09% iou-sqp-1074 [kernel.kallsyms] [k] cputime_adjust
and after this change, top of profile looks as follows:
+ 36.23% io_uring io_uring [.] submitter_uring_fn
+ 23.26% iou-sqp-819 [kernel.kallsyms] [k] io_sq_thread
+ 10.14% iou-sqp-819 [kernel.kallsyms] [k] io_sq_tw
+ 6.52% iou-sqp-819 [kernel.kallsyms] [k] tctx_task_work_run
+ 4.82% iou-sqp-819 [kernel.kallsyms] [k] nvme_submit_cmds.part.0
+ 2.91% iou-sqp-819 [kernel.kallsyms] [k] io_submit_sqes
[...]
0.02% iou-sqp-819 [kernel.kallsyms] [k] cputime_adjust
where it's spending the cycles on things that actually matter.
Reported-by: Fengnan Chang <changfengnan@bytedance.com>
Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/sqpoll.c | 43 ++++++++++++++++++++++++++++++++-----------
1 file changed, 32 insertions(+), 11 deletions(-)
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 8705b0aa82e0..f6916f46c047 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -170,6 +170,11 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
return READ_ONCE(sqd->state);
}
+struct io_sq_time {
+ bool started;
+ struct timespec64 ts;
+};
+
struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
{
u64 utime, stime;
@@ -178,15 +183,27 @@ struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
return ns_to_timespec64(stime);
}
-static void io_sq_update_worktime(struct io_sq_data *sqd, struct timespec64 start)
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct io_sq_time *ist)
{
struct timespec64 ts;
- ts = timespec64_sub(io_sq_cpu_time(current), start);
+ if (!ist->started)
+ return;
+ ist->started = false;
+ ts = timespec64_sub(io_sq_cpu_time(current), ist->ts);
sqd->work_time += ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}
-static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
+static void io_sq_start_worktime(struct io_sq_time *ist)
+{
+ if (ist->started)
+ return;
+ ist->started = true;
+ ist->ts = io_sq_cpu_time(current);
+}
+
+static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd,
+ bool cap_entries, struct io_sq_time *ist)
{
unsigned int to_submit;
int ret = 0;
@@ -199,6 +216,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
const struct cred *creds = NULL;
+ io_sq_start_worktime(ist);
+
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
@@ -277,7 +296,6 @@ static int io_sq_thread(void *data)
struct llist_node *retry_list = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
- struct timespec64 start;
unsigned long timeout = 0;
char buf[TASK_COMM_LEN] = {};
DEFINE_WAIT(wait);
@@ -315,6 +333,7 @@ static int io_sq_thread(void *data)
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;
+ struct io_sq_time ist = { };
if (io_sqd_events_pending(sqd) || signal_pending(current)) {
if (io_sqd_handle_event(sqd))
@@ -323,9 +342,8 @@ static int io_sq_thread(void *data)
}
cap_entries = !list_is_singular(&sqd->ctx_list);
- start = io_sq_cpu_time(current);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- int ret = __io_sq_thread(ctx, cap_entries);
+ int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);
if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
@@ -333,15 +351,18 @@ static int io_sq_thread(void *data)
if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
sqt_spin = true;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- if (io_napi(ctx))
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if (io_napi(ctx)) {
+ io_sq_start_worktime(&ist);
io_napi_sqpoll_busy_poll(ctx);
+ }
+ }
+
+ io_sq_update_worktime(sqd, &ist);
if (sqt_spin || !time_after(jiffies, timeout)) {
- if (sqt_spin) {
- io_sq_update_worktime(sqd, start);
+ if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
- }
if (unlikely(need_resched())) {
mutex_unlock(&sqd->lock);
cond_resched();
--
2.51.0
[-- Attachment #3: 0001-io_uring-sqpoll-switch-away-from-getrusage-for-CPU-a.patch --]
[-- Type: text/x-patch, Size: 4724 bytes --]
From 4052e20d3d63dac5e05135854e30c6b5f37fdd4a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 21 Oct 2025 07:16:08 -0600
Subject: [PATCH 1/2] io_uring/sqpoll: switch away from getrusage() for CPU
accounting
getrusage() does a lot more than what the SQPOLL accounting needs, the
latter only cares about (and uses) the stime. Rather than do a full
RUSAGE_SELF summation, just query the used stime instead.
Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/fdinfo.c | 9 +++++----
io_uring/sqpoll.c | 34 ++++++++++++++++++++--------------
io_uring/sqpoll.h | 1 +
3 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..966e06b078f6 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -59,7 +59,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
- struct rusage sq_usage;
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
unsigned int sq_head = READ_ONCE(r->sq.head);
unsigned int sq_tail = READ_ONCE(r->sq.tail);
@@ -152,14 +151,16 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
* thread termination.
*/
if (tsk) {
+ struct timespec64 ts;
+
get_task_struct(tsk);
rcu_read_unlock();
- getrusage(tsk, RUSAGE_SELF, &sq_usage);
+ ts = io_sq_cpu_time(tsk);
put_task_struct(tsk);
sq_pid = sq->task_pid;
sq_cpu = sq->sq_cpu;
- sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
- + sq_usage.ru_stime.tv_usec);
+ sq_total_time = (ts.tv_sec * 1000000
+ + ts.tv_nsec / 1000);
sq_work_time = sq->work_time;
} else {
rcu_read_unlock();
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index a3f11349ce06..8705b0aa82e0 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -11,6 +11,7 @@
#include <linux/audit.h>
#include <linux/security.h>
#include <linux/cpuset.h>
+#include <linux/sched/cputime.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@@ -169,6 +170,22 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
return READ_ONCE(sqd->state);
}
+struct timespec64 io_sq_cpu_time(struct task_struct *tsk)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(tsk, &utime, &stime);
+ return ns_to_timespec64(stime);
+}
+
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct timespec64 start)
+{
+ struct timespec64 ts;
+
+ ts = timespec64_sub(io_sq_cpu_time(current), start);
+ sqd->work_time += ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
+}
+
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
unsigned int to_submit;
@@ -255,23 +272,12 @@ static bool io_sq_tw_pending(struct llist_node *retry_list)
return retry_list || !llist_empty(&tctx->task_list);
}
-static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
-{
- struct rusage end;
-
- getrusage(current, RUSAGE_SELF, &end);
- end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
- end.ru_stime.tv_usec -= start->ru_stime.tv_usec;
-
- sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
-}
-
static int io_sq_thread(void *data)
{
struct llist_node *retry_list = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
- struct rusage start;
+ struct timespec64 start;
unsigned long timeout = 0;
char buf[TASK_COMM_LEN] = {};
DEFINE_WAIT(wait);
@@ -317,7 +323,7 @@ static int io_sq_thread(void *data)
}
cap_entries = !list_is_singular(&sqd->ctx_list);
- getrusage(current, RUSAGE_SELF, &start);
+ start = io_sq_cpu_time(current);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
@@ -333,7 +339,7 @@ static int io_sq_thread(void *data)
if (sqt_spin || !time_after(jiffies, timeout)) {
if (sqt_spin) {
- io_sq_update_worktime(sqd, &start);
+ io_sq_update_worktime(sqd, start);
timeout = jiffies + sqd->sq_thread_idle;
}
if (unlikely(need_resched())) {
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index b83dcdec9765..84ed2b312e88 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -29,6 +29,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+struct timespec64 io_sq_cpu_time(struct task_struct *tsk);
static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
{
--
2.51.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-10-21 17:54 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-20 11:30 [PATCH v2] io_uring: add IORING_SETUP_SQTHREAD_STATS flag to enable sqthread stats collection Fengnan Chang
2025-10-20 14:59 ` Gabriel Krisman Bertazi
2025-10-21 8:50 ` [External] " Fengnan Chang
2025-10-20 15:12 ` Jens Axboe
2025-10-21 8:54 ` [External] " Fengnan Chang
2025-10-21 17:54 ` Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox