From: Jens Axboe <[email protected]>
To: io-uring <[email protected]>
Subject: [PATCH v2] io_uring: enable toggle of iowait usage when waiting on CQEs
Date: Fri, 14 Mar 2025 12:48:28 -0600 [thread overview]
Message-ID: <[email protected]> (raw)
By default, io_uring marks a waiting task as being in iowait, if it's
sleeping waiting on events and there are pending requests. This isn't
necessarily always useful, and may be confusing on non-storage setups
where iowait isn't expected. It can also cause extra power usage, by
preventing the CPU from entering lower sleep states.
This adds a new enter flag, IORING_ENTER_NO_IOWAIT. If set, then
io_uring will not mark the sleeping task as being in iowait. If the
kernel supports this feature, then it will be marked by having the
IORING_FEAT_NO_IOWAIT feature flag set.
Signed-off-by: Jens Axboe <[email protected]>
---
Basic liburing support and a test case here:
https://git.kernel.dk/cgit/liburing/log/?h=iowait
Since v1:
- Add IORING_FEAT_NO_IOWAIT feature flag
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 050fa8eb2e8f..0d6c83c8d1cf 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -541,6 +541,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)
#define IORING_ENTER_EXT_ARG_REG (1U << 6)
+#define IORING_ENTER_NO_IOWAIT (1U << 7)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -578,6 +579,7 @@ struct io_uring_params {
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
#define IORING_FEAT_RW_ATTR (1U << 16)
+#define IORING_FEAT_NO_IOWAIT (1U << 17)
/*
* io_uring_register(2) opcodes and arguments
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 58003fa6b327..d975e68e91f2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2485,8 +2485,18 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
}
+struct ext_arg {
+ size_t argsz;
+ struct timespec64 ts;
+ const sigset_t __user *sig;
+ ktime_t min_time;
+ bool ts_set;
+ bool iowait;
+};
+
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
+ struct ext_arg *ext_arg,
ktime_t start_time)
{
int ret = 0;
@@ -2496,7 +2506,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
* can take into account that the task is waiting for IO - turns out
* to be important for low QD IO.
*/
- if (current_pending_io())
+ if (ext_arg->iowait && current_pending_io())
current->in_iowait = 1;
if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
@@ -2509,6 +2519,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
/* If this returns > 0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
+ struct ext_arg *ext_arg,
ktime_t start_time)
{
if (unlikely(READ_ONCE(ctx->check_cq)))
@@ -2522,17 +2533,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (unlikely(io_should_wake(iowq)))
return 0;
- return __io_cqring_wait_schedule(ctx, iowq, start_time);
+ return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time);
}
-struct ext_arg {
- size_t argsz;
- struct timespec64 ts;
- const sigset_t __user *sig;
- ktime_t min_time;
- bool ts_set;
-};
-
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -2610,7 +2613,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
TASK_INTERRUPTIBLE);
}
- ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
+ ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time);
__set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
@@ -3261,6 +3264,8 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
const struct io_uring_getevents_arg __user *uarg = argp;
struct io_uring_getevents_arg arg;
+ ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT);
+
/*
* If EXT_ARG isn't set, then we have no timespec and the argp pointer
* is just a pointer to the sigset_t.
@@ -3338,7 +3343,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
IORING_ENTER_REGISTERED_RING |
IORING_ENTER_ABS_TIMER |
- IORING_ENTER_EXT_ARG_REG)))
+ IORING_ENTER_EXT_ARG_REG |
+ IORING_ENTER_NO_IOWAIT)))
return -EINVAL;
/*
@@ -3752,7 +3758,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
- IORING_FEAT_RW_ATTR;
+ IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
--
Jens Axboe
reply other threads:[~2025-03-14 18:48 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox