* [PATCH v2] io_uring: enable toggle of iowait usage when waiting on CQEs
@ 2025-03-14 18:48 Jens Axboe
2025-03-16 6:57 ` Pavel Begunkov
0 siblings, 1 reply; 2+ messages in thread
From: Jens Axboe @ 2025-03-14 18:48 UTC (permalink / raw)
To: io-uring
By default, io_uring marks a waiting task as being in iowait, if it's
sleeping waiting on events and there are pending requests. This isn't
necessarily always useful, and may be confusing on non-storage setups
where iowait isn't expected. It can also cause extra power usage, by
preventing the CPU from entering lower sleep states.
This adds a new enter flag, IORING_ENTER_NO_IOWAIT. If set, then
io_uring will not mark the sleeping task as being in iowait. If the
kernel support this feature, then it will be marked by having the
IORING_FEAT_NO_IOWAIT feature flag set.
Signed-off-by: Jens Axboe <[email protected]>
---
Basic liburing support and a test case here:
https://git.kernel.dk/cgit/liburing/log/?h=iowait
Since v1:
- Add IORING_ENTER_NO_IOWAIT feature flag
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 050fa8eb2e8f..0d6c83c8d1cf 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -541,6 +541,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)
#define IORING_ENTER_EXT_ARG_REG (1U << 6)
+#define IORING_ENTER_NO_IOWAIT (1U << 7)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -578,6 +579,7 @@ struct io_uring_params {
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
#define IORING_FEAT_RW_ATTR (1U << 16)
+#define IORING_FEAT_NO_IOWAIT (1U << 17)
/*
* io_uring_register(2) opcodes and arguments
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 58003fa6b327..d975e68e91f2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2485,8 +2485,18 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
}
+struct ext_arg {
+ size_t argsz;
+ struct timespec64 ts;
+ const sigset_t __user *sig;
+ ktime_t min_time;
+ bool ts_set;
+ bool iowait;
+};
+
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
+ struct ext_arg *ext_arg,
ktime_t start_time)
{
int ret = 0;
@@ -2496,7 +2506,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
* can take into account that the task is waiting for IO - turns out
* to be important for low QD IO.
*/
- if (current_pending_io())
+ if (ext_arg->iowait && current_pending_io())
current->in_iowait = 1;
if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
@@ -2509,6 +2519,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
/* If this returns > 0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
+ struct ext_arg *ext_arg,
ktime_t start_time)
{
if (unlikely(READ_ONCE(ctx->check_cq)))
@@ -2522,17 +2533,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (unlikely(io_should_wake(iowq)))
return 0;
- return __io_cqring_wait_schedule(ctx, iowq, start_time);
+ return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time);
}
-struct ext_arg {
- size_t argsz;
- struct timespec64 ts;
- const sigset_t __user *sig;
- ktime_t min_time;
- bool ts_set;
-};
-
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -2610,7 +2613,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
TASK_INTERRUPTIBLE);
}
- ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
+ ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time);
__set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
@@ -3261,6 +3264,8 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
const struct io_uring_getevents_arg __user *uarg = argp;
struct io_uring_getevents_arg arg;
+ ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT);
+
/*
* If EXT_ARG isn't set, then we have no timespec and the argp pointer
* is just a pointer to the sigset_t.
@@ -3338,7 +3343,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
IORING_ENTER_REGISTERED_RING |
IORING_ENTER_ABS_TIMER |
- IORING_ENTER_EXT_ARG_REG)))
+ IORING_ENTER_EXT_ARG_REG |
+ IORING_ENTER_NO_IOWAIT)))
return -EINVAL;
/*
@@ -3752,7 +3758,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
- IORING_FEAT_RW_ATTR;
+ IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
--
Jens Axboe
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v2] io_uring: enable toggle of iowait usage when waiting on CQEs
2025-03-14 18:48 [PATCH v2] io_uring: enable toggle of iowait usage when waiting on CQEs Jens Axboe
@ 2025-03-16 6:57 ` Pavel Begunkov
0 siblings, 0 replies; 2+ messages in thread
From: Pavel Begunkov @ 2025-03-16 6:57 UTC (permalink / raw)
To: Jens Axboe, io-uring
On 3/14/25 18:48, Jens Axboe wrote:
> By default, io_uring marks a waiting task as being in iowait, if it's
> sleeping waiting on events and there are pending requests. This isn't
> necessarily always useful, and may be confusing on non-storage setups
> where iowait isn't expected. It can also cause extra power usage, by
I think this passage hints on controlling iowait stats, and in my opinion
we shouldn't conflate stats and optimisations. Global iowait stats
is there to stay, but ideally we want to never account io_uring as iowait.
That's while there were talks about removing optimisation toggle at all
(and do it as internal cpufreq magic, I suppose).
How about posing it as an optimisation option only and that iowait stat
is a side effect that can change. Explicitly spelling that in the commit
message and in a comment on top of the flag in an attempt to avoid the
uapi regression trap. We'd also need it in the option's man when it's
written. And I'd also add "hint" to the flag name, like
IORING_ENTER_HINT_NO_IOWAIT, as we might need to nop it if anything
changes on the cpufreq side.
--
Pavel Begunkov
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-03-16 6:56 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-14 18:48 [PATCH v2] io_uring: enable toggle of iowait usage when waiting on CQEs Jens Axboe
2025-03-16 6:57 ` Pavel Begunkov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox