public inbox for [email protected]
 help / color / mirror / Atom feed
From: hexue <[email protected]>
To: [email protected]
Cc: [email protected], [email protected],
	[email protected], [email protected],
	[email protected], [email protected],
	[email protected], [email protected],
	[email protected], [email protected],
	[email protected], hexue <[email protected]>
Subject: [PATCH v2] io_uring: releasing CPU resources when polling
Date: Thu, 18 Apr 2024 17:31:43 +0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: CGME20240418093150epcas5p31dc20cc737c72009265593f247e48262@epcas5p3.samsung.com

This patch is intended to release the CPU resources of io_uring in
polling mode. When IO is issued, the program immediately polls for
check completion, which is a waste of CPU resources when IO commands
are executed on the disk.

I add the hybrid polling feature in io_uring, enables polling to
release a portion of CPU resources without affecting block layer.

- Record the running time and context switching time of each
  IO, and use these time to determine whether a process continue
  to schedule.

- Adaptive adjustment to different devices. Due to the real-time
  nature of time recording, each device's IO processing speed is
  different, so the CPU optimization effect will vary.

- Set a interface (ctx->flag) enables application to choose whether
  or not to use this feature.

The CPU optimization in peak workload of patch is tested as follows:
  all CPU utilization of original polling is 100% for per CPU, after
  optimization, the CPU utilization drop a lot (per CPU);

   read(128k, QD64, 1Job)     37%   write(128k, QD64, 1Job)     40%
   randread(4k, QD64, 16Job)  52%   randwrite(4k, QD64, 16Job)  12%

  Compared to original polling, the optimised performance reduction
  with peak workload within 1%.

   read  0.29%     write  0.51%    randread  0.09%    randwrite  0%

Reviewed-by: KANCHAN JOSHI <[email protected]>
Signed-off-by: hexue <[email protected]>
---
 include/linux/io_uring_types.h | 10 +++++
 include/uapi/linux/io_uring.h  |  1 +
 io_uring/io_uring.c            | 28 +++++++++++++-
 io_uring/io_uring.h            |  2 +
 io_uring/rw.c                  | 69 ++++++++++++++++++++++++++++++++++
 5 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 854ad67a5f70..7607fd8de91c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -224,6 +224,11 @@ struct io_alloc_cache {
 	size_t			elem_size;
 };
 
+struct iopoll_info {
+	long		last_runtime;
+	long		last_irqtime;
+};
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
@@ -421,6 +426,7 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+	struct xarray		poll_array;
 };
 
 struct io_tw_state {
@@ -641,6 +647,10 @@ struct io_kiocb {
 		u64			extra1;
 		u64			extra2;
 	} big_cqe;
+	/* for hybrid iopoll */
+	int				poll_flag;
+	struct timespec64		iopoll_start;
+	struct timespec64		iopoll_end;
 };
 
 struct io_overflow_cqe {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7a673b52827b..0038cdfec18f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -198,6 +198,7 @@ enum {
  * Removes indirection through the SQ index array.
  */
 #define IORING_SETUP_NO_SQARRAY		(1U << 16)
+#define IORING_SETUP_HY_POLL	(1U << 17)
 
 enum io_uring_op {
 	IORING_OP_NOP,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cd9a137ad6ce..bfb94e975f97 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -79,6 +79,8 @@
 
 #include <uapi/linux/io_uring.h>
 
+#include <linux/time.h>
+#include <linux/timekeeping.h>
 #include "io-wq.h"
 
 #include "io_uring.h"
@@ -311,6 +313,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		goto err;
 
 	ctx->flags = p->flags;
+	xa_init(&ctx->poll_array);
 	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 	init_waitqueue_head(&ctx->sqo_sq_wait);
 	INIT_LIST_HEAD(&ctx->sqd_list);
@@ -1875,10 +1878,28 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
 	return !!req->file;
 }
 
+void init_hybrid_poll_info(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	u32 index;
+
+	index = req->file->f_inode->i_rdev;
+	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+
+	if (!entry) {
+		entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
+		entry->last_runtime = 0;
+		entry->last_irqtime = 0;
+		xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
+	}
+
+	ktime_get_ts64(&req->iopoll_start);
+}
+
 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 {
 	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 	const struct cred *creds = NULL;
+	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
 	if (unlikely(!io_assign_file(req, def, issue_flags)))
@@ -1890,6 +1911,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	if (!def->audit_skip)
 		audit_uring_entry(req->opcode);
 
+	if (ctx->flags & IORING_SETUP_HY_POLL)
+		init_hybrid_poll_info(ctx, req);
+
 	ret = def->issue(req, issue_flags);
 
 	if (!def->audit_skip)
@@ -2176,6 +2200,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->file = NULL;
 	req->rsrc_node = NULL;
 	req->task = current;
+	req->poll_flag = 0;
 
 	if (unlikely(opcode >= IORING_OP_LAST)) {
 		req->opcode = 0;
@@ -2921,6 +2946,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	kfree(ctx->cancel_table_locked.hbs);
 	kfree(ctx->io_bl);
 	xa_destroy(&ctx->io_bl_xa);
+	xa_destroy(&ctx->poll_array);
 	kfree(ctx);
 }
 
@@ -4050,7 +4076,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index d5495710c178..d5b175826adb 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -125,6 +125,8 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
 	__io_req_task_work_add(req, 0);
 }
 
+#define LEFT_TIME 1000
+
 #define io_for_each_link(pos, head) \
 	for (pos = (head); pos; pos = pos->link)
 
diff --git a/io_uring/rw.c b/io_uring/rw.c
index d5e79d9bdc71..ac73121030ee 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1118,6 +1118,69 @@ void io_rw_fail(struct io_kiocb *req)
 	io_req_set_res(req, res, req->cqe.flags);
 }
 
+void io_delay(struct io_kiocb *req, struct iopoll_info *entry)
+{
+	struct hrtimer_sleeper timer;
+	ktime_t kt;
+	struct timespec64 tc, oldtc;
+	enum hrtimer_mode mode;
+	long sleep_ti;
+
+	if (req->poll_flag == 1)
+		return;
+
+	if (entry->last_runtime <= entry->last_irqtime)
+		return;
+
+	/*
+	 * Avoid excessive scheduling time affecting performance
+	 * by using only 25 per cent of the remaining time
+	 */
+	sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4;
+	if (sleep_ti < LEFT_TIME)
+		return;
+
+	ktime_get_ts64(&oldtc);
+	kt = ktime_set(0, sleep_ti);
+	req->poll_flag = 1;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, mode);
+	if (timer.task) {
+		io_schedule();
+	}
+	hrtimer_cancel(&timer.timer);
+	mode = HRTIMER_MODE_ABS;
+
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+
+	ktime_get_ts64(&tc);
+	entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti;
+}
+
+int iouring_hybrid_poll(struct io_kiocb *req, struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct iopoll_info *entry;
+	int ret;
+	u32 index;
+
+	index = req->file->f_inode->i_rdev;
+	entry = xa_load(&ctx->poll_array, index);
+	io_delay(req, entry);
+	ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+
+	ktime_get_ts64(&req->iopoll_end);
+	entry->last_runtime = req->iopoll_end.tv_nsec - req->iopoll_start.tv_nsec;
+	return ret;
+}
+
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
@@ -1145,6 +1208,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
+		if (ctx->flags & IORING_SETUP_HY_POLL) {
+			ret = iouring_hybrid_poll(req, &iob, poll_flags);
+			goto comb;
+		}
+
 		if (req->opcode == IORING_OP_URING_CMD) {
 			struct io_uring_cmd *ioucmd;
 
@@ -1156,6 +1224,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 
 			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
 		}
+comb:
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
-- 
2.40.1


       reply	other threads:[~2024-04-19  8:47 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20240418093150epcas5p31dc20cc737c72009265593f247e48262@epcas5p3.samsung.com>
2024-04-18  9:31 ` hexue [this message]
2024-04-19  9:09   ` [PATCH v2] io_uring: releasing CPU resources when polling Ammar Faizi
     [not found]     ` <CGME20240423033155epcas5p315a8e6ea0114402afafed84e5902ed6b@epcas5p3.samsung.com>
2024-04-23  3:31       ` hexue
2024-04-19 12:27   ` Pavel Begunkov
     [not found]     ` <CGME20240422081827epcas5p21ba3154cfcaa7520d7db412f5d3a15d2@epcas5p2.samsung.com>
2024-04-22  8:18       ` Re: io_uring: " hexue
2024-04-22 18:11   ` [PATCH v2] " Jens Axboe
     [not found]     ` <CGME20240423032657epcas5p4d97f377b48bd40e792d187013c5e409c@epcas5p4.samsung.com>
2024-04-23  3:26       ` hexue

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox