[PATCH v2] io_uring: releasing CPU resources when polling

public inbox for [email protected]
 help / color / mirror / Atom feed

* [PATCH v2] io_uring: releasing CPU resources when polling
       [not found] <CGME20240418093150epcas5p31dc20cc737c72009265593f247e48262@epcas5p3.samsung.com>
@ 2024-04-18  9:31 ` hexue
  2024-04-19  9:09   ` Ammar Faizi
                     ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: hexue @ 2024-04-18  9:31 UTC (permalink / raw)
  To: axboe
  Cc: asml.silence, linux-kernel, io-uring, peiwei.li, joshi.k,
	kundan.kumar, anuj20.g, wenwen.chen, ruyi.zhang, xiaobing.li,
	cliang01.li, hexue

This patch is intended to release the CPU resources of io_uring in
polling mode. When IO is issued, the program immediately polls for
check completion, which is a waste of CPU resources when IO commands
are executed on the disk.

I add the hybrid polling feature in io_uring, enables polling to
release a portion of CPU resources without affecting block layer.

- Record the running time and context switching time of each
  IO, and use these time to determine whether a process continue
  to schedule.

- Adaptive adjustment to different devices. Due to the real-time
  nature of time recording, each device's IO processing speed is
  different, so the CPU optimization effect will vary.

- Set a interface (ctx->flag) enables application to choose whether
  or not to use this feature.

The CPU optimization in peak workload of patch is tested as follows:
  all CPU utilization of original polling is 100% for per CPU, after
  optimization, the CPU utilization drop a lot (per CPU);

   read(128k, QD64, 1Job)     37%   write(128k, QD64, 1Job)     40%
   randread(4k, QD64, 16Job)  52%   randwrite(4k, QD64, 16Job)  12%

  Compared to original polling, the optimised performance reduction
  with peak workload within 1%.

   read  0.29%     write  0.51%    randread  0.09%    randwrite  0%

Reviewed-by: KANCHAN JOSHI <[email protected]>
Signed-off-by: hexue <[email protected]>
---
 include/linux/io_uring_types.h | 10 +++++
 include/uapi/linux/io_uring.h  |  1 +
 io_uring/io_uring.c            | 28 +++++++++++++-
 io_uring/io_uring.h            |  2 +
 io_uring/rw.c                  | 69 ++++++++++++++++++++++++++++++++++
 5 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 854ad67a5f70..7607fd8de91c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -224,6 +224,11 @@ struct io_alloc_cache {
 	size_t			elem_size;
 };
 
+struct iopoll_info {
+	long		last_runtime;
+	long		last_irqtime;
+};
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
@@ -421,6 +426,7 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+	struct xarray		poll_array;
 };
 
 struct io_tw_state {
@@ -641,6 +647,10 @@ struct io_kiocb {
 		u64			extra1;
 		u64			extra2;
 	} big_cqe;
+	/* for hybrid iopoll */
+	int				poll_flag;
+	struct timespec64		iopoll_start;
+	struct timespec64		iopoll_end;
 };
 
 struct io_overflow_cqe {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7a673b52827b..0038cdfec18f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -198,6 +198,7 @@ enum {
  * Removes indirection through the SQ index array.
  */
 #define IORING_SETUP_NO_SQARRAY		(1U << 16)
+#define IORING_SETUP_HY_POLL	(1U << 17)
 
 enum io_uring_op {
 	IORING_OP_NOP,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cd9a137ad6ce..bfb94e975f97 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -79,6 +79,8 @@
 
 #include <uapi/linux/io_uring.h>
 
+#include <linux/time.h>
+#include <linux/timekeeping.h>
 #include "io-wq.h"
 
 #include "io_uring.h"
@@ -311,6 +313,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		goto err;
 
 	ctx->flags = p->flags;
+	xa_init(&ctx->poll_array);
 	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 	init_waitqueue_head(&ctx->sqo_sq_wait);
 	INIT_LIST_HEAD(&ctx->sqd_list);
@@ -1875,10 +1878,28 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
 	return !!req->file;
 }
 
+void init_hybrid_poll_info(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	u32 index;
+
+	index = req->file->f_inode->i_rdev;
+	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+
+	if (!entry) {
+		entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
+		entry->last_runtime = 0;
+		entry->last_irqtime = 0;
+		xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
+	}
+
+	ktime_get_ts64(&req->iopoll_start);
+}
+
 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 {
 	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 	const struct cred *creds = NULL;
+	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
 	if (unlikely(!io_assign_file(req, def, issue_flags)))
@@ -1890,6 +1911,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	if (!def->audit_skip)
 		audit_uring_entry(req->opcode);
 
+	if (ctx->flags & IORING_SETUP_HY_POLL)
+		init_hybrid_poll_info(ctx, req);
+
 	ret = def->issue(req, issue_flags);
 
 	if (!def->audit_skip)
@@ -2176,6 +2200,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->file = NULL;
 	req->rsrc_node = NULL;
 	req->task = current;
+	req->poll_flag = 0;
 
 	if (unlikely(opcode >= IORING_OP_LAST)) {
 		req->opcode = 0;
@@ -2921,6 +2946,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	kfree(ctx->cancel_table_locked.hbs);
 	kfree(ctx->io_bl);
 	xa_destroy(&ctx->io_bl_xa);
+	xa_destroy(&ctx->poll_array);
 	kfree(ctx);
 }
 
@@ -4050,7 +4076,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index d5495710c178..d5b175826adb 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -125,6 +125,8 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
 	__io_req_task_work_add(req, 0);
 }
 
+#define LEFT_TIME 1000
+
 #define io_for_each_link(pos, head) \
 	for (pos = (head); pos; pos = pos->link)
 
diff --git a/io_uring/rw.c b/io_uring/rw.c
index d5e79d9bdc71..ac73121030ee 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1118,6 +1118,69 @@ void io_rw_fail(struct io_kiocb *req)
 	io_req_set_res(req, res, req->cqe.flags);
 }
 
+void io_delay(struct io_kiocb *req, struct iopoll_info *entry)
+{
+	struct hrtimer_sleeper timer;
+	ktime_t kt;
+	struct timespec64 tc, oldtc;
+	enum hrtimer_mode mode;
+	long sleep_ti;
+
+	if (req->poll_flag == 1)
+		return;
+
+	if (entry->last_runtime <= entry->last_irqtime)
+		return;
+
+	/*
+	 * Avoid excessive scheduling time affecting performance
+	 * by using only 25 per cent of the remaining time
+	 */
+	sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4;
+	if (sleep_ti < LEFT_TIME)
+		return;
+
+	ktime_get_ts64(&oldtc);
+	kt = ktime_set(0, sleep_ti);
+	req->poll_flag = 1;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, mode);
+	if (timer.task) {
+		io_schedule();
+	}
+	hrtimer_cancel(&timer.timer);
+	mode = HRTIMER_MODE_ABS;
+
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+
+	ktime_get_ts64(&tc);
+	entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti;
+}
+
+int iouring_hybrid_poll(struct io_kiocb *req, struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct iopoll_info *entry;
+	int ret;
+	u32 index;
+
+	index = req->file->f_inode->i_rdev;
+	entry = xa_load(&ctx->poll_array, index);
+	io_delay(req, entry);
+	ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+
+	ktime_get_ts64(&req->iopoll_end);
+	entry->last_runtime = req->iopoll_end.tv_nsec - req->iopoll_start.tv_nsec;
+	return ret;
+}
+
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
@@ -1145,6 +1208,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
+		if (ctx->flags & IORING_SETUP_HY_POLL) {
+			ret = iouring_hybrid_poll(req, &iob, poll_flags);
+			goto comb;
+		}
+
 		if (req->opcode == IORING_OP_URING_CMD) {
 			struct io_uring_cmd *ioucmd;
 
@@ -1156,6 +1224,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 
 			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
 		}
+comb:
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] io_uring: releasing CPU resources when polling
  2024-04-18  9:31 ` [PATCH v2] io_uring: releasing CPU resources when polling hexue
@ 2024-04-19  9:09   ` Ammar Faizi
       [not found]     ` <CGME20240423033155epcas5p315a8e6ea0114402afafed84e5902ed6b@epcas5p3.samsung.com>
  2024-04-19 12:27   ` Pavel Begunkov
  2024-04-22 18:11   ` [PATCH v2] " Jens Axboe
  2 siblings, 1 reply; 7+ messages in thread
From: Ammar Faizi @ 2024-04-19  9:09 UTC (permalink / raw)
  To: hexue
  Cc: Linux Kernel Mailing List, io-uring Mailing List, Jens Axboe,
	Pavel Begunkov, Kanchan Joshi, Kundan Kumar, Anuj Gupta,
	Wenwen Chen, Ruyi Zhang, Xiaobing Li, cliang01.li, peiwei.li

On Fri, Apr 19, 2024 at 3:47 PM hexue wrote:
> +void init_hybrid_poll_info(struct io_ring_ctx *ctx, struct io_kiocb *req)
> +{
> +       u32 index;
> +
> +       index = req->file->f_inode->i_rdev;
> +       struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
> +
> +       if (!entry) {
> +               entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
> +               entry->last_runtime = 0;
> +               entry->last_irqtime = 0;
> +               xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
> +       }

GFP_KERNEL may fail; you must check for failure. Otherwise, it could
lead to NULL pointer dereference.

-- 
Ammar Faizi

^ permalink raw reply	[flat|nested] 7+ messages in thread

[parent not found: <CGME20240423033155epcas5p315a8e6ea0114402afafed84e5902ed6b@epcas5p3.samsung.com>]

* Re: Re: [PATCH v2] io_uring: releasing CPU resources when polling
       [not found]     ` <CGME20240423033155epcas5p315a8e6ea0114402afafed84e5902ed6b@epcas5p3.samsung.com>
@ 2024-04-23  3:31       ` hexue
  0 siblings, 0 replies; 7+ messages in thread
From: hexue @ 2024-04-23  3:31 UTC (permalink / raw)
  To: ammarfaizi2
  Cc: anuj20.g, asml.silence, axboe, cliang01.li, io-uring, joshi.k,
	kundan.kumar, linux-kernel, peiwei.li, ruyi.zhang, wenwen.chen,
	xiaobing.li, xue01.he

On Mon, Apr 19, 2024 at 16:09 Ammar Faizi wrote:
>On Fri, Apr 19, 2024 at 3:47 PM hexue wrote:
>> +void init_hybrid_poll_info(struct io_ring_ctx *ctx, struct io_kiocb *req)
>> +{
>> +       u32 index;
>> +
>> +       index = req->file->f_inode->i_rdev;
>> +       struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
>> +
>> +       if (!entry) {
>> +               entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
>> +               entry->last_runtime = 0;
>> +               entry->last_irqtime = 0;
>> +               xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
>> +       }
>
>GFP_KERNEL may fail; you must check for failure. Otherwise, it could
>lead to NULL pointer dereference.
>

yes, thanks for your correction, I will fix this and resubmit v3 patch.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] io_uring: releasing CPU resources when polling
  2024-04-18  9:31 ` [PATCH v2] io_uring: releasing CPU resources when polling hexue
  2024-04-19  9:09   ` Ammar Faizi
@ 2024-04-19 12:27   ` Pavel Begunkov
       [not found]     ` <CGME20240422081827epcas5p21ba3154cfcaa7520d7db412f5d3a15d2@epcas5p2.samsung.com>
  2024-04-22 18:11   ` [PATCH v2] " Jens Axboe
  2 siblings, 1 reply; 7+ messages in thread
From: Pavel Begunkov @ 2024-04-19 12:27 UTC (permalink / raw)
  To: hexue, axboe
  Cc: linux-kernel, io-uring, peiwei.li, joshi.k, kundan.kumar,
	anuj20.g, wenwen.chen, ruyi.zhang, xiaobing.li, cliang01.li

On 4/18/24 10:31, hexue wrote:
> This patch is intended to release the CPU resources of io_uring in
> polling mode. When IO is issued, the program immediately polls for
> check completion, which is a waste of CPU resources when IO commands
> are executed on the disk.
> 
> I add the hybrid polling feature in io_uring, enables polling to
> release a portion of CPU resources without affecting block layer.

So that's basically the block layer hybrid polling, which, to
remind, was removed not that long ago, but moved into io_uring.

> - Record the running time and context switching time of each
>    IO, and use these time to determine whether a process continue
>    to schedule.
> 
> - Adaptive adjustment to different devices. Due to the real-time
>    nature of time recording, each device's IO processing speed is
>    different, so the CPU optimization effect will vary.
> 
> - Set a interface (ctx->flag) enables application to choose whether
>    or not to use this feature.
> 
> The CPU optimization in peak workload of patch is tested as follows:
>    all CPU utilization of original polling is 100% for per CPU, after
>    optimization, the CPU utilization drop a lot (per CPU);

The first version was about cases that don't have iopoll queues.
How many IO poll queues did you have to get these numbers?


>     read(128k, QD64, 1Job)     37%   write(128k, QD64, 1Job)     40%
>     randread(4k, QD64, 16Job)  52%   randwrite(4k, QD64, 16Job)  12%
> 
>    Compared to original polling, the optimised performance reduction
>    with peak workload within 1%.
> 
>     read  0.29%     write  0.51%    randread  0.09%    randwrite  0%
> 
> Reviewed-by: KANCHAN JOSHI <[email protected]>

Kanchan, did you _really_ take a look at the patch?

> Signed-off-by: hexue <[email protected]>
> ---
>   include/linux/io_uring_types.h | 10 +++++
>   include/uapi/linux/io_uring.h  |  1 +
>   io_uring/io_uring.c            | 28 +++++++++++++-
>   io_uring/io_uring.h            |  2 +
>   io_uring/rw.c                  | 69 ++++++++++++++++++++++++++++++++++
>   5 files changed, 109 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 854ad67a5f70..7607fd8de91c 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -224,6 +224,11 @@ struct io_alloc_cache {
>   	size_t			elem_size;
>   };


-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 7+ messages in thread

[parent not found: <CGME20240422081827epcas5p21ba3154cfcaa7520d7db412f5d3a15d2@epcas5p2.samsung.com>]

* Re: Re: io_uring: io_uring: releasing CPU resources when polling
       [not found]     ` <CGME20240422081827epcas5p21ba3154cfcaa7520d7db412f5d3a15d2@epcas5p2.samsung.com>
@ 2024-04-22  8:18       ` hexue
  0 siblings, 0 replies; 7+ messages in thread
From: hexue @ 2024-04-22  8:18 UTC (permalink / raw)
  To: asml.silence
  Cc: anuj20.g, axboe, cliang01.li, io-uring, joshi.k, kundan.kumar,
	linux-kernel, peiwei.li, ruyi.zhang, wenwen.chen, xiaobing.li,
	xue01.he

On 4/19/24 13:27, Pavel Begunkov wrote:
>On 4/18/24 10:31, hexue wrote:
>> This patch is intended to release the CPU resources of io_uring in
>> polling mode. When IO is issued, the program immediately polls for
>> check completion, which is a waste of CPU resources when IO commands
>> are executed on the disk.
>> 
>> I add the hybrid polling feature in io_uring, enables polling to
>> release a portion of CPU resources without affecting block layer.
>
>So that's basically the block layer hybrid polling, which, to
>remind, was removed not that long ago, but moved into io_uring.

The idea is based on the previous blcok layer hybrid poll, but
it's not just for single IO. I think hybrid poll is still an effective
CPU-saving solution, and I've tested it with good results on both PCIe
Gen4 and Gen5 nvme devices.

>> - Record the running time and context switching time of each
>>    IO, and use these time to determine whether a process continue
>>    to schedule.
>> 
>> - Adaptive adjustment to different devices. Due to the real-time
>>    nature of time recording, each device's IO processing speed is
>>    different, so the CPU optimization effect will vary.
>> 
>> - Set a interface (ctx->flag) enables application to choose whether
>>    or not to use this feature.
>> 
>> The CPU optimization in peak workload of patch is tested as follows:
>>    all CPU utilization of original polling is 100% for per CPU, after
>>    optimization, the CPU utilization drop a lot (per CPU);
>
>The first version was about cases that don't have iopoll queues.
>How many IO poll queues did you have to get these numbers?

The test enviroment has 8 CPU 16G mem, and I set 8 poll queues this case.
These data of the test from Gen4 disk.

>>     read(128k, QD64, 1Job)     37%   write(128k, QD64, 1Job)     40%
>>     randread(4k, QD64, 16Job)  52%   randwrite(4k, QD64, 16Job)  12%
>> 
>>    Compared to original polling, the optimised performance reduction
>>    with peak workload within 1%.
>> 
>>     read  0.29%     write  0.51%    randread  0.09%    randwrite  0%
>> 
>> Reviewed-by: KANCHAN JOSHI <[email protected]>
>
>Kanchan, did you _really_ take a look at the patch?
>

sorry I misunderstood the meaning of "reviewed", I've had some discussions
with Kanchan based on the test results, he just give some suggestions and
possible approach for changes but haven't reviewed the implementation yet.
This is my mistake, please ignore this "reviewed" message.

>> Signed-off-by: hexue <[email protected]>
>> ---
>>   include/linux/io_uring_types.h | 10 +++++
>>   include/uapi/linux/io_uring.h  |  1 +
>>   io_uring/io_uring.c            | 28 +++++++++++++-
>>   io_uring/io_uring.h            |  2 +
>>   io_uring/rw.c                  | 69 ++++++++++++++++++++++++++++++++++
>>   5 files changed, 109 insertions(+), 1 deletion(-)
>> 
>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>> index 854ad67a5f70..7607fd8de91c 100644
>> --- a/include/linux/io_uring_types.h
>> +++ b/include/linux/io_uring_types.h
>> @@ -224,6 +224,11 @@ struct io_alloc_cache {
>>   	size_t			elem_size;
>>   };
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] io_uring: releasing CPU resources when polling
  2024-04-18  9:31 ` [PATCH v2] io_uring: releasing CPU resources when polling hexue
  2024-04-19  9:09   ` Ammar Faizi
  2024-04-19 12:27   ` Pavel Begunkov
@ 2024-04-22 18:11   ` Jens Axboe
       [not found]     ` <CGME20240423032657epcas5p4d97f377b48bd40e792d187013c5e409c@epcas5p4.samsung.com>
  2 siblings, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2024-04-22 18:11 UTC (permalink / raw)
  To: hexue
  Cc: asml.silence, linux-kernel, io-uring, peiwei.li, joshi.k,
	kundan.kumar, anuj20.g, wenwen.chen, ruyi.zhang, xiaobing.li,
	cliang01.li

On 4/18/24 3:31 AM, hexue wrote:
> This patch is intended to release the CPU resources of io_uring in
> polling mode. When IO is issued, the program immediately polls for
> check completion, which is a waste of CPU resources when IO commands
> are executed on the disk.
> 
> I add the hybrid polling feature in io_uring, enables polling to
> release a portion of CPU resources without affecting block layer.
> 
> - Record the running time and context switching time of each
>   IO, and use these time to determine whether a process continue
>   to schedule.
> 
> - Adaptive adjustment to different devices. Due to the real-time
>   nature of time recording, each device's IO processing speed is
>   different, so the CPU optimization effect will vary.
> 
> - Set a interface (ctx->flag) enables application to choose whether
>   or not to use this feature.
> 
> The CPU optimization in peak workload of patch is tested as follows:
>   all CPU utilization of original polling is 100% for per CPU, after
>   optimization, the CPU utilization drop a lot (per CPU);
> 
>    read(128k, QD64, 1Job)     37%   write(128k, QD64, 1Job)     40%
>    randread(4k, QD64, 16Job)  52%   randwrite(4k, QD64, 16Job)  12%
> 
>   Compared to original polling, the optimised performance reduction
>   with peak workload within 1%.
> 
>    read  0.29%     write  0.51%    randread  0.09%    randwrite  0%

As mentioned, this is like a reworked version of the old hybrid polling
we had. The feature itself may make sense, but there's a slew of things
in this patch that aren't really acceptable. More below.

> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 854ad67a5f70..7607fd8de91c 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -224,6 +224,11 @@ struct io_alloc_cache {
>  	size_t			elem_size;
>  };
>  
> +struct iopoll_info {
> +	long		last_runtime;
> +	long		last_irqtime;
> +};
> +
>  struct io_ring_ctx {
>  	/* const or read-mostly hot data */
>  	struct {
> @@ -421,6 +426,7 @@ struct io_ring_ctx {
>  	unsigned short			n_sqe_pages;
>  	struct page			**ring_pages;
>  	struct page			**sqe_pages;
> +	struct xarray		poll_array;
>  };
>  
>  struct io_tw_state {
> @@ -641,6 +647,10 @@ struct io_kiocb {
>  		u64			extra1;
>  		u64			extra2;
>  	} big_cqe;
> +	/* for hybrid iopoll */
> +	int				poll_flag;
> +	struct timespec64		iopoll_start;
> +	struct timespec64		iopoll_end;
>  };

This is adding 4/8 + 16 + 16 bytes to the io_kiocb - or in other ways to
look at it, growing it by ~17% in size. That does not seem appropriate,
given the limited scope of the feature.

> @@ -1875,10 +1878,28 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
>  	return !!req->file;
>  }
>  
> +void init_hybrid_poll_info(struct io_ring_ctx *ctx, struct io_kiocb *req)
> +{
> +	u32 index;
> +
> +	index = req->file->f_inode->i_rdev;
> +	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);

Mixing code and declarations, that's a no go. This should look like:


	u32 index = req->file->f_inode->i_rdev;
	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);

Outside of that, this is now dipping into the inode from the hot path.
You could probably make do with f_inode here, as this is just a lookup
key?

It's also doing an extra lookup per polled IO. I guess the overhead is
fine as it's just for the hybrid setup, though not ideal.

> +
> +	if (!entry) {
> +		entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);

As also brought up, you need error checking on allocations.

> +		entry->last_runtime = 0;
> +		entry->last_irqtime = 0;
> +		xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
> +	}
> +
> +	ktime_get_ts64(&req->iopoll_start);
> +}

Time retrieval per IO is not cheap.

>  static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
>  {
>  	const struct io_issue_def *def = &io_issue_defs[req->opcode];
>  	const struct cred *creds = NULL;
> +	struct io_ring_ctx *ctx = req->ctx;
>  	int ret;
>  
>  	if (unlikely(!io_assign_file(req, def, issue_flags)))
> @@ -1890,6 +1911,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
>  	if (!def->audit_skip)
>  		audit_uring_entry(req->opcode);
>  
> +	if (ctx->flags & IORING_SETUP_HY_POLL)
> +		init_hybrid_poll_info(ctx, req);
> +
>  	ret = def->issue(req, issue_flags);

Would probably be better to have this in the path of the opcodes that
actually support iopoll, rather than add a branch for any kind of IO.

> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index d5495710c178..d5b175826adb 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -125,6 +125,8 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
>  	__io_req_task_work_add(req, 0);
>  }
>  
> +#define LEFT_TIME 1000

This badly needs a comment and a better name...

> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index d5e79d9bdc71..ac73121030ee 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -1118,6 +1118,69 @@ void io_rw_fail(struct io_kiocb *req)
>  	io_req_set_res(req, res, req->cqe.flags);
>  }
>  
> +void io_delay(struct io_kiocb *req, struct iopoll_info *entry)
> +{
> +	struct hrtimer_sleeper timer;
> +	ktime_t kt;
> +	struct timespec64 tc, oldtc;
> +	enum hrtimer_mode mode;
> +	long sleep_ti;
> +
> +	if (req->poll_flag == 1)
> +		return;
> +
> +	if (entry->last_runtime <= entry->last_irqtime)
> +		return;
> +
> +	/*
> +	 * Avoid excessive scheduling time affecting performance
> +	 * by using only 25 per cent of the remaining time
> +	 */
> +	sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4;
> +	if (sleep_ti < LEFT_TIME)
> +		return;
> +
> +	ktime_get_ts64(&oldtc);
> +	kt = ktime_set(0, sleep_ti);
> +	req->poll_flag = 1;
> +
> +	mode = HRTIMER_MODE_REL;
> +	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
> +	hrtimer_set_expires(&timer.timer, kt);
> +
> +	set_current_state(TASK_UNINTERRUPTIBLE);
> +	hrtimer_sleeper_start_expires(&timer, mode);
> +	if (timer.task) {
> +		io_schedule();
> +	}

Redundant braces.

> +	hrtimer_cancel(&timer.timer);
> +	mode = HRTIMER_MODE_ABS;
> +
> +	__set_current_state(TASK_RUNNING);
> +	destroy_hrtimer_on_stack(&timer.timer);
> +
> +	ktime_get_ts64(&tc);
> +	entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti;
> +}
> +
> +int iouring_hybrid_poll(struct io_kiocb *req, struct io_comp_batch *iob, unsigned int poll_flags)

Overly long line.

> +	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> +	struct io_ring_ctx *ctx = req->ctx;
> +	struct iopoll_info *entry;
> +	int ret;
> +	u32 index;
> +
> +	index = req->file->f_inode->i_rdev;

Ditto here on i_rdev vs inode.

> +	entry = xa_load(&ctx->poll_array, index);
> +	io_delay(req, entry);
> +	ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
> +
> +	ktime_get_ts64(&req->iopoll_end);
> +	entry->last_runtime = req->iopoll_end.tv_nsec - req->iopoll_start.tv_nsec;
> +	return ret;
> +}
> +
>  int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>  {
>  	struct io_wq_work_node *pos, *start, *prev;
> @@ -1145,6 +1208,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>  		if (READ_ONCE(req->iopoll_completed))
>  			break;
>  
> +		if (ctx->flags & IORING_SETUP_HY_POLL) {
> +			ret = iouring_hybrid_poll(req, &iob, poll_flags);
> +			goto comb;
> +		}

comb?

> +
>  		if (req->opcode == IORING_OP_URING_CMD) {
>  			struct io_uring_cmd *ioucmd;
>  
> @@ -1156,6 +1224,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>  
>  			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
>  		}
> +comb:
>  		if (unlikely(ret < 0))
>  			return ret;
>  		else if (ret)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 7+ messages in thread

[parent not found: <CGME20240423032657epcas5p4d97f377b48bd40e792d187013c5e409c@epcas5p4.samsung.com>]

* Re: Re: [PATCH v2] io_uring: releasing CPU resources when polling
       [not found]     ` <CGME20240423032657epcas5p4d97f377b48bd40e792d187013c5e409c@epcas5p4.samsung.com>
@ 2024-04-23  3:26       ` hexue
  0 siblings, 0 replies; 7+ messages in thread
From: hexue @ 2024-04-23  3:26 UTC (permalink / raw)
  To: axboe
  Cc: anuj20.g, asml.silence, cliang01.li, io-uring, joshi.k,
	kundan.kumar, linux-kernel, peiwei.li, ruyi.zhang, wenwen.chen,
	xiaobing.li, xue01.he

On 4/22/24 18:11, Jens Axboe wrote:
>On 4/18/24 3:31 AM, hexue wrote:
>> This patch is intended to release the CPU resources of io_uring in
>> polling mode. When IO is issued, the program immediately polls for
>> check completion, which is a waste of CPU resources when IO commands
>> are executed on the disk.
>> 
>> I add the hybrid polling feature in io_uring, enables polling to
>> release a portion of CPU resources without affecting block layer.
>> 
>> - Record the running time and context switching time of each
>>   IO, and use these time to determine whether a process continue
>>   to schedule.
>> 
>> - Adaptive adjustment to different devices. Due to the real-time
>>   nature of time recording, each device's IO processing speed is
>>   different, so the CPU optimization effect will vary.
>> 
>> - Set a interface (ctx->flag) enables application to choose whether
>>   or not to use this feature.
>> 
>> The CPU optimization in peak workload of patch is tested as follows:
>>   all CPU utilization of original polling is 100% for per CPU, after
>>   optimization, the CPU utilization drop a lot (per CPU);
>> 
>>    read(128k, QD64, 1Job)     37%   write(128k, QD64, 1Job)     40%
>>    randread(4k, QD64, 16Job)  52%   randwrite(4k, QD64, 16Job)  12%
>> 
>>   Compared to original polling, the optimised performance reduction
>>   with peak workload within 1%.
>> 
>>    read  0.29%     write  0.51%    randread  0.09%    randwrite  0%
>
>As mentioned, this is like a reworked version of the old hybrid polling
>we had. The feature itself may make sense, but there's a slew of things
>in this patch that aren't really acceptable. More below.

Thank you very much for your patience in reviewing and correcting, I will
improve those as soon as possible and submit the v3 patch later.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-04-23  3:43 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <CGME20240418093150epcas5p31dc20cc737c72009265593f247e48262@epcas5p3.samsung.com>
2024-04-18  9:31 ` [PATCH v2] io_uring: releasing CPU resources when polling hexue
2024-04-19  9:09   ` Ammar Faizi
     [not found]     ` <CGME20240423033155epcas5p315a8e6ea0114402afafed84e5902ed6b@epcas5p3.samsung.com>
2024-04-23  3:31       ` hexue
2024-04-19 12:27   ` Pavel Begunkov
     [not found]     ` <CGME20240422081827epcas5p21ba3154cfcaa7520d7db412f5d3a15d2@epcas5p2.samsung.com>
2024-04-22  8:18       ` Re: io_uring: " hexue
2024-04-22 18:11   ` [PATCH v2] " Jens Axboe
     [not found]     ` <CGME20240423032657epcas5p4d97f377b48bd40e792d187013c5e409c@epcas5p4.samsung.com>
2024-04-23  3:26       ` hexue

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox