From: hexue <[email protected]>
To: [email protected], [email protected]
Cc: [email protected], [email protected],
hexue <[email protected]>
Subject: [PATCH v7 RESENT] io_uring: releasing CPU resources when polling
Date: Thu, 8 Aug 2024 15:17:12 +0800 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: CGME20240808071720epcas5p3f6f4f8abc6d4c02523dd4f64153a7cec@epcas5p3.samsung.com
This patch add a new hybrid poll at io_uring level, it also set a signal
"IORING_SETUP_HY_POLL" to application, aim to provide a interface for users
to enable use new hybrid polling flexibly.
io_uring use polling mode could improve the IO performence, but it will
spend 100% of CPU resources to do polling.
A new hybrid poll is implemented on the io_uring layer. Once IO issued,
it will not polling immediately, but block first and re-run before IO
complete, then poll to reap IO. This poll function could be a suboptimal
solution when running on a single thread, it offers the performance lower
than regular polling but higher than IRQ, and CPU utilization is also lower
than polling.
Test Result
fio-3.35, 16 poll queues, 1 thread
-------------------------------------------------------------------------
Performance
-------------------------------------------------------------------------
write read randwrite randread
regular poll BW=3939MiB/s BW=6613MiB/s IOPS=190K IOPS=470K
IRQ BW=3927MiB/s BW=6612MiB/s IOPS=181K IOPS=203K
hybrid poll BW=3937MiB/s BW=6623MiB/s IOPS=190K IOPS=358K(suboptimal)
-------------------------------------------------------------------------
CPU Utilization
------------------------------------------------------
write read randwrite randread
regular poll 100% 100% 100% 100%
IRQ 50% 53% 100% 100%
hybrid poll 70% 37% 70% 85%
------------------------------------------------------
--
changes of RESENT:
- rebase code on for-6.12/io_uring
changes since v6:
- Modified IO path, distinct iopoll and uring_cmd_iopoll
- update test results
changes since v5:
- Remove cstime recorder
- Use minimize sleep time in different drivers
- Use the half of whole runtime to do schedule
- Consider as a suboptimal solution between
regular poll and IRQ
changes since v4:
- Rewrote the commit
- Update the test results
- Reorganized the code basd on 6.11
changes since v3:
- Simplified the commit
- Add some comments on code
changes since v2:
- Modified some formatting errors
- Move judgement to poll path
changes since v1:
- Extend hybrid poll to async polled io
Signed-off-by: hexue <[email protected]>
---
include/linux/io_uring_types.h | 6 ++
include/uapi/linux/io_uring.h | 1 +
io_uring/io_uring.c | 3 +-
io_uring/rw.c | 100 +++++++++++++++++++++++++++++----
4 files changed, 99 insertions(+), 11 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3315005df117..35ac4a8bf6ab 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -422,6 +422,8 @@ struct io_ring_ctx {
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
+ /* for io_uring hybrid poll*/
+ u64 available_time;
};
struct io_tw_state {
@@ -657,6 +659,10 @@ struct io_kiocb {
u64 extra1;
u64 extra2;
} big_cqe;
+ /* for io_uring hybrid iopoll */
+ bool poll_state;
+ u64 iopoll_start;
+ u64 iopoll_end;
};
struct io_overflow_cqe {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2aaf7ee256ac..42ae868651b0 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -199,6 +199,7 @@ enum io_uring_sqe_flags_bit {
* Removes indirection through the SQ index array.
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
+#define IORING_SETUP_HY_POLL (1U << 17)
enum io_uring_op {
IORING_OP_NOP,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3942db160f18..bb3dfd749572 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -301,6 +301,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
+ ctx->available_time = LLONG_MAX;
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
@@ -3603,7 +3604,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
- IORING_SETUP_NO_SQARRAY))
+ IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
return -EINVAL;
return io_uring_create(entries, &p, params);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index c004d21e2f12..eb9791b50d36 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -772,6 +772,13 @@ static bool need_complete_io(struct io_kiocb *req)
S_ISBLK(file_inode(req->file)->i_mode);
}
+static void init_hybrid_poll(struct io_kiocb *req)
+{
+ /* make sure every req only block once*/
+ req->poll_state = false;
+ req->iopoll_start = ktime_get_ns();
+}
+
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -809,6 +816,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
+ if (ctx->flags & IORING_SETUP_HY_POLL)
+ init_hybrid_poll(req);
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -1105,6 +1114,83 @@ void io_rw_fail(struct io_kiocb *req)
io_req_set_res(req, res, req->cqe.flags);
}
+static int io_uring_classic_poll(struct io_kiocb *req,
+ struct io_comp_batch *iob, unsigned int poll_flags)
+{
+ int ret;
+ struct file *file = req->file;
+
+ if (req->opcode == IORING_OP_URING_CMD) {
+ struct io_uring_cmd *ioucmd;
+
+ ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ ret = file->f_op->uring_cmd_iopoll(ioucmd, iob,
+ poll_flags);
+ } else {
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+
+ ret = file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+ }
+ return ret;
+}
+
+static u64 io_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct hrtimer_sleeper timer;
+ enum hrtimer_mode mode;
+ ktime_t kt;
+ u64 sleep_time;
+
+ if (req->poll_state)
+ return 0;
+
+ if (ctx->available_time == LLONG_MAX)
+ return 0;
+
+ /* Using half running time to do schedul */
+ sleep_time = ctx->available_time / 2;
+
+ kt = ktime_set(0, sleep_time);
+ req->poll_state = true;
+
+ mode = HRTIMER_MODE_REL;
+ hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires(&timer.timer, kt);
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_sleeper_start_expires(&timer, mode);
+
+ if (timer.task)
+ io_schedule();
+
+ hrtimer_cancel(&timer.timer);
+ __set_current_state(TASK_RUNNING);
+ destroy_hrtimer_on_stack(&timer.timer);
+
+ return sleep_time;
+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+ struct io_comp_batch *iob, unsigned int poll_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+ u64 runtime, sleep_time;
+
+ sleep_time = io_delay(ctx, req);
+ ret = io_uring_classic_poll(req, iob, poll_flags);
+ req->iopoll_end = ktime_get_ns();
+ runtime = req->iopoll_end - req->iopoll_start - sleep_time;
+ if (runtime < 0)
+ return 0;
+
+ /* use minimize sleep time if there are different speed
+ * drivers, it could get more completions from fast one
+ */
+ if (ctx->available_time > runtime)
+ ctx->available_time = runtime;
+ return ret;
+}
+
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
struct io_wq_work_node *pos, *start, *prev;
@@ -1132,17 +1218,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (READ_ONCE(req->iopoll_completed))
break;
- if (req->opcode == IORING_OP_URING_CMD) {
- struct io_uring_cmd *ioucmd;
-
- ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
- ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
- poll_flags);
- } else {
- struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ if (ctx->flags & IORING_SETUP_HY_POLL)
+ ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+ else
+ ret = io_uring_classic_poll(req, &iob, poll_flags);
- ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
- }
if (unlikely(ret < 0))
return ret;
else if (ret)
--
2.40.1
next parent reply other threads:[~2024-08-08 7:32 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CGME20240808071720epcas5p3f6f4f8abc6d4c02523dd4f64153a7cec@epcas5p3.samsung.com>
2024-08-08 7:17 ` hexue [this message]
2024-08-08 16:30 ` [PATCH v7 RESENT] io_uring: releasing CPU resources when polling kernel test robot
2024-08-09 5:17 ` kernel test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox