public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH v6 RESEND] io_uring: releasing CPU resources when polling
       [not found] <CGME20240709092951epcas5p24783c3bb5c23277cf23a72a6e1855751@epcas5p2.samsung.com>
@ 2024-07-09  9:29 ` hexue
       [not found]   ` <CGME20240718100113epcas5p255acf51a58cf410d5d0e8cffbca41994@epcas5p2.samsung.com>
  2024-07-22 12:31   ` [PATCH v6 RESEND] " Pavel Begunkov
  0 siblings, 2 replies; 4+ messages in thread
From: hexue @ 2024-07-09  9:29 UTC (permalink / raw)
  To: axboe; +Cc: asml.silence, io-uring, linux-kernel, hexue

Using io_uring in polling mode can improve IO performance, but it
spends 100% of a CPU's resources on polling.

This adds a setup flag, "IORING_SETUP_HY_POLL", for applications, aiming to
provide an interface for users to enable a new hybrid polling mode at the
io_uring level.

A new hybrid poll is implemented in the io_uring layer. Once an IO is issued,
it does not poll immediately; instead it blocks first and resumes running
before the IO completes, then polls to reap the IO. This poll mode can be a
suboptimal solution when running on a single thread: it offers performance
lower than regular polling but higher than IRQ, and its CPU utilization is
also lower than polling.

Test Result
fio-3.35, Gen 4 device
-------------------------------------------------------------------------------------
Performance
-------------------------------------------------------------------------------------
                  write          read           randwrite       randread
regular poll    BW=3939MiB/s    BW=6596MiB/s    IOPS=190K       IOPS=526K
IRQ             BW=3927MiB/s    BW=6567MiB/s    IOPS=181K       IOPS=216K
hybrid poll     BW=3933MiB/s    BW=6600MiB/s    IOPS=190K       IOPS=390K(suboptimal)
-------------------------------------------------------------------------------------
CPU Utilization
-------------------------------------------------------------------------------------
                write   read    randwrite       randread
regular poll    100%    100%    100%            100%
IRQ             38%     53%     100%            100%
hybrid poll     76%     32%     70%              85%
-------------------------------------------------------------------------------------

--
changes since v5:
- Remove cstime recorder
- Use the minimum sleep time across different drivers
- Use half of the whole runtime as the sleep time
- Treat this as a suboptimal solution between
  regular poll and IRQ

changes since v4:
- Rewrote the commit
- Update the test results
- Reorganized the code based on 6.11

changes since v3:
- Simplified the commit
- Add some comments on code

changes since v2:
- Fixed some formatting errors
- Move judgement to poll path

changes since v1:
- Extend hybrid poll to async polled io

Signed-off-by: hexue <[email protected]>
---
 include/linux/io_uring_types.h |  6 +++
 include/uapi/linux/io_uring.h  |  1 +
 io_uring/io_uring.c            |  3 +-
 io_uring/rw.c                  | 74 +++++++++++++++++++++++++++++++++-
 4 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 91224bbcfa73..0897126fb2d7 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -428,6 +428,8 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+	/* for hybrid poll*/
+	u64			available_time;
 };
 
 struct io_tw_state {
@@ -665,6 +667,10 @@ struct io_kiocb {
 		u64			extra1;
 		u64			extra2;
 	} big_cqe;
+    /* for hybrid iopoll */
+	bool		poll_state;
+	u64			iopoll_start;
+	u64			iopoll_end;
 };
 
 struct io_overflow_cqe {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 994bf7af0efe..ef32ec319d1f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -199,6 +199,7 @@ enum io_uring_sqe_flags_bit {
  * Removes indirection through the SQ index array.
  */
 #define IORING_SETUP_NO_SQARRAY		(1U << 16)
+#define IORING_SETUP_HY_POLL	(1U << 17)
 
 enum io_uring_op {
 	IORING_OP_NOP,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 816e93e7f949..b38f8af118c5 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -299,6 +299,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		goto err;
 
 	ctx->flags = p->flags;
+	ctx->available_time = LLONG_MAX;
 	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 	init_waitqueue_head(&ctx->sqo_sq_wait);
 	INIT_LIST_HEAD(&ctx->sqd_list);
@@ -3637,7 +3638,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1a2128459cb4..5505f4292ce5 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -772,6 +772,13 @@ static bool need_complete_io(struct io_kiocb *req)
 		S_ISBLK(file_inode(req->file)->i_mode);
 }
 
+static void init_hybrid_poll(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	/* make sure every req only block once*/
+	req->poll_state = false;
+	req->iopoll_start = ktime_get_ns();
+}
+
 static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -809,6 +816,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 		req->iopoll_completed = 0;
+		if (ctx->flags & IORING_SETUP_HY_POLL)
+			init_hybrid_poll(ctx, req);
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
 			return -EINVAL;
@@ -1106,6 +1115,67 @@ void io_rw_fail(struct io_kiocb *req)
 	io_req_set_res(req, res, req->cqe.flags);
 }
 
+static u64 io_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	struct hrtimer_sleeper timer;
+	enum hrtimer_mode mode;
+	ktime_t kt;
+	u64 sleep_time;
+
+	if (req->poll_state)
+		return 0;
+
+	if (ctx->available_time == LLONG_MAX)
+		return 0;
+
+	/* Using half running time to do schedul */
+	sleep_time = ctx->available_time / 2;
+
+	kt = ktime_set(0, sleep_time);
+	req->poll_state = true;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+	set_current_state(TASK_INTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, mode);
+
+	if (timer.task)
+		io_schedule();
+
+	hrtimer_cancel(&timer.timer);
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+
+	return sleep_time;
+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+				struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct io_ring_ctx *ctx = req->ctx;
+	int ret;
+	u64 runtime, sleep_time;
+
+	sleep_time = io_delay(ctx, req);
+
+	/* it doesn't implement with io_uring passthrough now */
+	ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+
+	req->iopoll_end = ktime_get_ns();
+	runtime = req->iopoll_end - req->iopoll_start - sleep_time;
+	if (runtime < 0)
+		return 0;
+
+	/* use minimize sleep time if there are different speed
+	 * drivers, it could get more completions from fast one
+	 */
+	if (ctx->available_time > runtime)
+		ctx->available_time = runtime;
+	return ret;
+}
+
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
@@ -1133,7 +1203,9 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		if (req->opcode == IORING_OP_URING_CMD) {
+		if (ctx->flags & IORING_SETUP_HY_POLL) {
+			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+		} else if (req->opcode == IORING_OP_URING_CMD) {
 			struct io_uring_cmd *ioucmd;
 
 			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH V6] io_uring: releasing CPU resources when polling
       [not found]   ` <CGME20240718100113epcas5p255acf51a58cf410d5d0e8cffbca41994@epcas5p2.samsung.com>
@ 2024-07-18 10:01     ` hexue
  2024-07-19 19:05       ` Jens Axboe
  0 siblings, 1 reply; 4+ messages in thread
From: hexue @ 2024-07-18 10:01 UTC (permalink / raw)
  To: axboe; +Cc: asml.silence, io-uring, linux-kernel

On 09/07/24 9:29AM, hexue wrote:
>io_uring use polling mode could improve the IO performence, but it will
>spend 100% of CPU resources to do polling.
>
>This set a signal "IORING_SETUP_HY_POLL" to application, aim to provide
>a interface for user to enable a new hybrid polling at io_uring level.

Hi, just a gentle ping. Any comments on this patch?
--
hexue

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V6] io_uring: releasing CPU resources when polling
  2024-07-18 10:01     ` [PATCH V6] " hexue
@ 2024-07-19 19:05       ` Jens Axboe
  0 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2024-07-19 19:05 UTC (permalink / raw)
  To: hexue; +Cc: asml.silence, io-uring, linux-kernel

On 7/18/24 4:01 AM, hexue wrote:
> On 09/07/24 9:29AM, hexue wrote:
>> io_uring use polling mode could improve the IO performence, but it will
>> spend 100% of CPU resources to do polling.
>>
>> This set a signal "IORING_SETUP_HY_POLL" to application, aim to provide
>> a interface for user to enable a new hybrid polling at io_uring level.
> 
> Hi, just a gentle ping. Any coments on this patch?

It's merge window and vacation time, and related to the former of those
two, any changes for this would have to be targeted to the next kernel
release. So it'll get looked at once things settle a bit, there's no
rush as the 6.11 merge window is already in progress.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v6 RESEND] io_uring: releasing CPU resources when polling
  2024-07-09  9:29 ` [PATCH v6 RESEND] io_uring: releasing CPU resources when polling hexue
       [not found]   ` <CGME20240718100113epcas5p255acf51a58cf410d5d0e8cffbca41994@epcas5p2.samsung.com>
@ 2024-07-22 12:31   ` Pavel Begunkov
  1 sibling, 0 replies; 4+ messages in thread
From: Pavel Begunkov @ 2024-07-22 12:31 UTC (permalink / raw)
  To: hexue, axboe; +Cc: io-uring, linux-kernel

On 7/9/24 10:29, hexue wrote:
> io_uring use polling mode could improve the IO performence, but it will
> spend 100% of CPU resources to do polling.
> 
> This set a signal "IORING_SETUP_HY_POLL" to application, aim to provide
> a interface for user to enable a new hybrid polling at io_uring level.
> 
> A new hybrid poll is implemented on the io_uring layer. Once IO issued,
> it will not polling immediately, but block first and re-run before IO
> complete, then poll to reap IO. This poll function could be a suboptimal
> solution when running on a single thread, it offers the performance lower
> than regular polling but higher than IRQ, and CPU utilization is also lower
> than polling.
> 
> Test Result
> fio-3.35, Gen 4 device
> -------------------------------------------------------------------------------------
> Performance
> -------------------------------------------------------------------------------------
>                    write          read           randwrite       randread
> regular poll    BW=3939MiB/s    BW=6596MiB/s    IOPS=190K       IOPS=526K
> IRQ             BW=3927MiB/s    BW=6567MiB/s    IOPS=181K       IOPS=216K
> hybrid poll     BW=3933MiB/s    BW=6600MiB/s    IOPS=190K       IOPS=390K(suboptimal)
> -------------------------------------------------------------------------------------
> CPU Utilization
> -------------------------------------------------------------------------------------
>                  write   read    randwrite       randread
> regular poll    100%    100%    100%            100%
> IRQ             38%     53%     100%            100%
> hybrid poll     76%     32%     70%              85%
> -------------------------------------------------------------------------------------
> 
> --
...
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 1a2128459cb4..5505f4292ce5 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> +static int io_uring_hybrid_poll(struct io_kiocb *req,
> +				struct io_comp_batch *iob, unsigned int poll_flags)
> +{
> +	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> +	struct io_ring_ctx *ctx = req->ctx;
> +	int ret;
> +	u64 runtime, sleep_time;
> +
> +	sleep_time = io_delay(ctx, req);
> +
> +	/* it doesn't implement with io_uring passthrough now */
> +	ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);

->iopoll vs ->uring_cmd_iopoll, same comment as in my
previous review


> +
> +	req->iopoll_end = ktime_get_ns();
> +	runtime = req->iopoll_end - req->iopoll_start - sleep_time;
> +	if (runtime < 0)
> +		return 0;
> +
> +	/* use minimize sleep time if there are different speed
> +	 * drivers, it could get more completions from fast one
> +	 */
> +	if (ctx->available_time > runtime)
> +		ctx->available_time = runtime;
> +	return ret;
> +}
> +
>   int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>   {
>   	struct io_wq_work_node *pos, *start, *prev;
> @@ -1133,7 +1203,9 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
>   		if (READ_ONCE(req->iopoll_completed))
>   			break;
>   
> -		if (req->opcode == IORING_OP_URING_CMD) {
> +		if (ctx->flags & IORING_SETUP_HY_POLL) {
> +			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
> +		} else if (req->opcode == IORING_OP_URING_CMD) {
>   			struct io_uring_cmd *ioucmd;
>   
>   			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-07-22 12:30 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <CGME20240709092951epcas5p24783c3bb5c23277cf23a72a6e1855751@epcas5p2.samsung.com>
2024-07-09  9:29 ` [PATCH v6 RESEND] io_uring: releasing CPU resources when polling hexue
     [not found]   ` <CGME20240718100113epcas5p255acf51a58cf410d5d0e8cffbca41994@epcas5p2.samsung.com>
2024-07-18 10:01     ` [PATCH V6] " hexue
2024-07-19 19:05       ` Jens Axboe
2024-07-22 12:31   ` [PATCH v6 RESEND] " Pavel Begunkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox