From: David Wei <[email protected]>
To: [email protected]
Cc: Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
David Wei <[email protected]>
Subject: [PATCH liburing v2] Add io_uring_iowait_toggle()
Date: Fri, 16 Aug 2024 16:20:48 -0700 [thread overview]
Message-ID: <[email protected]> (raw)
Add io_uring_iowait_toggle() helper function for the userspace liburing
side of the IORING_ENTER_NO_IOWAIT flag added to io_uring for kernel 6.12.
This function toggles whether a ring sets in_iowait when waiting for
completions. This is useful when waiting for multiple batched
completions using e.g. io_uring_submit_and_wait_timeout() and userspace
treats iowait time as CPU utilization.
It works by keeping an internal flag INT_FLAG_NO_IOWAIT, which if set
will set IORING_ENTER_NO_IOWAIT on every io_uring_enter().
Manpages are added/modified, a unit test is included, and io_uring.h is
synced with the kernel side.
Signed-off-by: David Wei <[email protected]>
---
v2:
- edit manpages
---
man/io_uring_enter.2 | 6 ++
man/io_uring_iowait_toggle.3 | 52 ++++++++++
src/include/liburing.h | 1 +
src/include/liburing/io_uring.h | 2 +
src/int_flags.h | 1 +
src/liburing.map | 2 +
src/queue.c | 2 +
src/setup.c | 12 +++
test/Makefile | 1 +
test/no-iowait.c | 162 ++++++++++++++++++++++++++++++++
10 files changed, 241 insertions(+)
create mode 100644 man/io_uring_iowait_toggle.3
create mode 100644 test/no-iowait.c
diff --git a/man/io_uring_enter.2 b/man/io_uring_enter.2
index 5e4121b..da9b870 100644
--- a/man/io_uring_enter.2
+++ b/man/io_uring_enter.2
@@ -104,6 +104,12 @@ If the ring file descriptor has been registered through use of
then setting this flag will tell the kernel that the
.I ring_fd
passed in is the registered ring offset rather than a normal file descriptor.
+.TP
+.B IORING_ENTER_NO_IOWAIT
+If this flag is set, then waiting on events will not be accounted as iowait for
+the task if
+.BR io_uring_enter (2)
+results in waiting.
.PP
.PP
diff --git a/man/io_uring_iowait_toggle.3 b/man/io_uring_iowait_toggle.3
new file mode 100644
index 0000000..41a6367
--- /dev/null
+++ b/man/io_uring_iowait_toggle.3
@@ -0,0 +1,52 @@
+.\" Copyright (C) 2024 David Wei <[email protected]>
+.\"
+.\" SPDX-License-Identifier: LGPL-2.0-or-later
+.\"
+.TH io_uring_iowait_toggle 3 "Aug 16, 2024" "liburing-2.8" "liburing Manual"
+.SH NAME
+io_uring_iowait_toggle \- toggle whether waiting for events is accounted as iowait
+.SH SYNOPSIS
+.nf
+.B #include <liburing.h>
+.PP
+.BI "int io_uring_iowait_toggle(struct io_uring *" ring ","
+.BI "                           bool " enabled ");"
+.fi
+.SH DESCRIPTION
+.PP
+The
+.BR io_uring_iowait_toggle (3)
+function toggles for a given
+.I ring
+whether waiting for events is accounted as iowait time for the task. When set
+to true, time spent waiting is accounted as iowait time; otherwise, it is
+accounted as idle time. The default behavior is to always account time waiting
+for events as iowait time.
+
+Setting in_iowait achieves two things:
+.TP
+.B 1. Account time spent waiting as iowait time
+.TP
+.B 2. Enable cpufreq optimizations, setting SCHED_CPUFREQ_IOWAIT on the rq
+.PP
+
+The accounting aspect is a relic from the days of uniprocessor systems, where
+iowait indicates that a task is blocked uninterruptibly waiting for IO and
+cannot perform other work. iowait with SMP systems is mostly a bogus
+accounting value, but is set to enable cpufreq boosts for high frequency waits.
+
+Some user tooling attributes iowait time as CPU utilization time, so high
+iowait time can look like apparent high CPU utilization, even though the task
+is not scheduled and the CPU is free to run other tasks.
+.BR io_uring_iowait_toggle (3)
+provides a way to disable this behavior where it makes sense to do so.
+
+Available since kernel 6.12.
+
+.SH RETURN VALUE
+On success
+.BR io_uring_iowait_toggle (3)
+returns 0. If the kernel does not support this feature, it returns
+.BR -EOPNOTSUPP .
+.
diff --git a/src/include/liburing.h b/src/include/liburing.h
index 1092f3b..ddc154c 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -243,6 +243,7 @@ int io_uring_unregister_napi(struct io_uring *ring, struct io_uring_napi *napi);
int io_uring_get_events(struct io_uring *ring);
int io_uring_submit_and_get_events(struct io_uring *ring);
+int io_uring_iowait_toggle(struct io_uring *ring, bool enabled);
/*
* io_uring syscalls.
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 01c36a8..36cabc5 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -504,6 +504,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
+#define IORING_ENTER_NO_IOWAIT (1U << 6)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -539,6 +540,7 @@ struct io_uring_params {
#define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
+#define IORING_FEAT_IOWAIT_TOGGLE (1U << 15)
/*
* io_uring_register(2) opcodes and arguments
diff --git a/src/int_flags.h b/src/int_flags.h
index 548dd10..80ad7cb 100644
--- a/src/int_flags.h
+++ b/src/int_flags.h
@@ -6,6 +6,7 @@ enum {
INT_FLAG_REG_RING = 1,
INT_FLAG_REG_REG_RING = 2,
INT_FLAG_APP_MEM = 4,
+ INT_FLAG_NO_IOWAIT = 8,
};
#endif
diff --git a/src/liburing.map b/src/liburing.map
index fa096bb..e64fe2d 100644
--- a/src/liburing.map
+++ b/src/liburing.map
@@ -97,4 +97,6 @@ LIBURING_2.7 {
} LIBURING_2.6;
LIBURING_2.8 {
+ global:
+ io_uring_iowait_toggle;
} LIBURING_2.7;
diff --git a/src/queue.c b/src/queue.c
index c436061..bd2e6af 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -110,6 +110,8 @@ static int _io_uring_get_cqe(struct io_uring *ring,
if (ring->int_flags & INT_FLAG_REG_RING)
flags |= IORING_ENTER_REGISTERED_RING;
+ if (ring->int_flags & INT_FLAG_NO_IOWAIT)
+ flags |= IORING_ENTER_NO_IOWAIT;
ret = __sys_io_uring_enter2(ring->enter_ring_fd, data->submit,
data->wait_nr, flags, data->arg,
data->sz);
diff --git a/src/setup.c b/src/setup.c
index 1997d25..2647363 100644
--- a/src/setup.c
+++ b/src/setup.c
@@ -687,3 +687,15 @@ int io_uring_free_buf_ring(struct io_uring *ring, struct io_uring_buf_ring *br,
__sys_munmap(br, nentries * sizeof(struct io_uring_buf));
return 0;
}
+
+/*
+ * Choose whether waits on @ring are accounted as iowait.
+ *
+ * The choice is stored in ring->int_flags as INT_FLAG_NO_IOWAIT: the flag is
+ * cleared when @enabled is true and set when @enabled is false.
+ *
+ * Returns 0 on success, or -EOPNOTSUPP when ring->features does not contain
+ * IORING_FEAT_IOWAIT_TOGGLE (in which case int_flags is left untouched).
+ */
+int io_uring_iowait_toggle(struct io_uring *ring, bool enabled)
+{
+	if ((ring->features & IORING_FEAT_IOWAIT_TOGGLE) == 0)
+		return -EOPNOTSUPP;
+
+	if (!enabled) {
+		ring->int_flags |= INT_FLAG_NO_IOWAIT;
+		return 0;
+	}
+
+	ring->int_flags &= ~INT_FLAG_NO_IOWAIT;
+	return 0;
+}
diff --git a/test/Makefile b/test/Makefile
index 0538a75..d9a049c 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -125,6 +125,7 @@ test_srcs := \
msg-ring-flags.c \
msg-ring-overflow.c \
multicqes_drain.c \
+ no-iowait.c \
no-mmap-inval.c \
nolibc.c \
nop-all-sizes.c \
diff --git a/test/no-iowait.c b/test/no-iowait.c
new file mode 100644
index 0000000..8e7cb6a
--- /dev/null
+++ b/test/no-iowait.c
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: test no iowait toggle
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sched.h>
+
+#include "helpers.h"
+#include "liburing.h"
+#include "../src/syscall.h"
+
+#define TIMEOUT_SEC 1
+#define PINNED_CPU 0
+
+/*
+ * Restrict the caller's CPU affinity to PINNED_CPU, the CPU whose
+ * /proc/stat line get_iowait() parses.
+ *
+ * Returns 0 on success, 1 if sched_setaffinity() fails.
+ */
+static int pin_to_cpu(void)
+{
+	cpu_set_t set;
+
+	CPU_ZERO(&set);
+	CPU_SET(PINNED_CPU, &set);
+	if (sched_setaffinity(0, sizeof(set), &set) == -1)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Read the iowait field of the PINNED_CPU line in /proc/stat.
+ *
+ * Returns the counter converted to int, or -1 if /proc/stat cannot be
+ * opened, no line for PINNED_CPU exists, or that line does not contain the
+ * five leading numeric fields (user, nice, system, idle, iowait).
+ */
+static int get_iowait(void)
+{
+	FILE *fp;
+	char line[1024];
+	char cpu[10];
+	int sz, ret = -1;
+	unsigned long long user, nice, system, idle, iowait;
+
+	sz = snprintf(cpu, sizeof(cpu), "cpu%d", PINNED_CPU);
+	fp = fopen("/proc/stat", "r");
+	if (fp == NULL)
+		return -1;
+
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		/* Require a separator so that "cpu1" cannot match "cpu10". */
+		if (strncmp(line, cpu, sz) != 0 || line[sz] != ' ')
+			continue;
+
+		/* Only trust iowait if every field up to it was parsed. */
+		if (sscanf(line, "%*s %llu %llu %llu %llu %llu", &user,
+			   &nice, &system, &idle, &iowait) == 5)
+			ret = (int)iowait;
+		break;
+	}
+
+	fclose(fp);
+	return ret;
+}
+
+/*
+ * Apply io_uring_iowait_toggle(ring, enabled), submit a TIMEOUT_SEC timeout,
+ * block in io_uring_wait_cqe() until it completes with -ETIME, and compare
+ * the get_iowait() delta across the wait with what the setting implies:
+ *
+ *  - enabled:  the delta must lie strictly between 90% and 110% of
+ *              TIMEOUT_SEC * sysconf(_SC_CLK_TCK);
+ *  - disabled: the delta must stay below sysconf(_SC_CLK_TCK) / 10.
+ *
+ * Returns T_EXIT_SKIP if the toggle reports -EOPNOTSUPP, T_EXIT_FAIL on any
+ * error or unexpected delta, T_EXIT_PASS otherwise.
+ */
+static int test_iowait(struct io_uring *ring, bool enabled)
+{
+	struct __kernel_timespec ts = { .tv_sec = TIMEOUT_SEC, .tv_nsec = 0 };
+	struct io_uring_sqe *sqe;
+	struct io_uring_cqe *cqe;
+	int ret, before, after, delta;
+	long hz;
+
+	if (io_uring_iowait_toggle(ring, enabled) == -EOPNOTSUPP)
+		return T_EXIT_SKIP;
+
+	sqe = io_uring_get_sqe(ring);
+	if (sqe == NULL) {
+		fprintf(stderr, "%s: get sqe failed\n", __FUNCTION__);
+		return T_EXIT_FAIL;
+	}
+	io_uring_prep_timeout(sqe, &ts, 0, 0);
+
+	ret = io_uring_submit(ring);
+	if (ret < 1) {
+		fprintf(stderr, "%s: sqe submit failed: %d\n", __FUNCTION__, ret);
+		return T_EXIT_FAIL;
+	}
+
+	/* Sample the counter right before blocking ... */
+	before = get_iowait();
+	if (before < 0) {
+		fprintf(stderr, "%s: open /proc/stat failed\n", __FUNCTION__);
+		return T_EXIT_FAIL;
+	}
+
+	ret = io_uring_wait_cqe(ring, &cqe);
+	if (ret < 0) {
+		fprintf(stderr, "%s: wait completion %d\n", __FUNCTION__, ret);
+		return T_EXIT_FAIL;
+	}
+
+	ret = cqe->res;
+	io_uring_cqe_seen(ring, cqe);
+	if (ret != -ETIME) {
+		fprintf(stderr, "%s: Timeout: %s\n", __FUNCTION__, strerror(-ret));
+		return T_EXIT_FAIL;
+	}
+
+	/* ... and again once the timeout has fired. */
+	after = get_iowait();
+	if (after < 0) {
+		fprintf(stderr, "%s: open /proc/stat failed\n", __FUNCTION__);
+		return T_EXIT_FAIL;
+	}
+
+	delta = after - before;
+	hz = sysconf(_SC_CLK_TCK);
+
+	if (!enabled)
+		return delta >= hz / 10 ? T_EXIT_FAIL : T_EXIT_PASS;
+
+	if (delta >= (TIMEOUT_SEC * hz * 11) / 10 ||
+	    delta <= (TIMEOUT_SEC * hz * 9) / 10)
+		return T_EXIT_FAIL;
+
+	return T_EXIT_PASS;
+}
+
+/*
+ * Pin the process to PINNED_CPU, set up a ring, and run test_iowait() first
+ * with iowait accounting enabled and then with it disabled.
+ *
+ * Returns 0 when both runs pass (or when any argument is given), 1 if
+ * pinning or ring setup fails, and otherwise the first non-zero
+ * test_iowait() result. Once the ring has been set up, every path tears it
+ * down with io_uring_queue_exit() before returning.
+ */
+int main(int argc, char *argv[])
+{
+	struct io_uring ring;
+	struct io_uring_params p = { };
+	int ret;
+
+	if (argc > 1)
+		return 0;
+
+	ret = pin_to_cpu();
+	if (ret) {
+		fprintf(stderr, "pinning to cpu%d failed\n", PINNED_CPU);
+		return 1;
+	}
+
+	ret = io_uring_queue_init_params(8, &ring, &p);
+	if (ret) {
+		fprintf(stderr, "ring setup failed\n");
+		return 1;
+	}
+
+	ret = test_iowait(&ring, true);
+	if (ret == T_EXIT_SKIP)
+		goto out;
+	if (ret) {
+		fprintf(stderr, "test_iowait with iowait enabled failed\n");
+		goto out;
+	}
+
+	ret = test_iowait(&ring, false);
+	if (ret)
+		fprintf(stderr, "test_iowait with iowait disabled failed\n");
+
+out:
+	io_uring_queue_exit(&ring);
+	return ret;
+}
--
2.43.5
reply other threads:[~2024-08-16 23:21 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox