public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-16 20:38 [PATCHSET v3 0/5] Add support for batched min timeout Jens Axboe
@ 2024-08-16 20:38 ` Jens Axboe
  0 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-16 20:38 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

In preparation for having two distinct timeouts and avoid waking the
task if we don't need to.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
 io_uring/io_uring.h |  2 ++
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2bdb66902f58..6e53ebd58aab 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 	 * the task, and the next invocation will do it.
 	 */
-	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
+	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
 		return autoremove_wake_function(curr, mode, wake_flags, key);
 	return -1;
 }
@@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
 	return percpu_counter_read_positive(&tctx->inflight);
 }
 
+static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
+{
+	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	WRITE_ONCE(iowq->hit_timeout, 1);
+	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+		wake_up_process(ctx->submitter_task);
+	else
+		io_cqring_wake(ctx);
+	return HRTIMER_NORESTART;
+}
+
+static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
+				      clockid_t clock_id)
+{
+	iowq->hit_timeout = 0;
+	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
+	iowq->t.function = io_cqring_timer_wakeup;
+	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
+	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
+
+	if (!READ_ONCE(iowq->hit_timeout))
+		schedule();
+
+	hrtimer_cancel(&iowq->t);
+	destroy_hrtimer_on_stack(&iowq->t);
+	__set_current_state(TASK_RUNNING);
+
+	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
+}
+
 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 				     struct io_wait_queue *iowq)
 {
@@ -2362,11 +2394,10 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 */
 	if (!iowq->no_iowait && current_pending_io())
 		current->in_iowait = 1;
-	if (iowq->timeout == KTIME_MAX)
+	if (iowq->timeout != KTIME_MAX)
+		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
+	else
 		schedule();
-	else if (!schedule_hrtimeout_range_clock(&iowq->timeout, 0,
-						 HRTIMER_MODE_ABS, ctx->clockid))
-		ret = -ETIME;
 	current->in_iowait = 0;
 	return ret;
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e0868b79774c..bac830a2d6ec 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -40,7 +40,9 @@ struct io_wait_queue {
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
 	unsigned nr_timeouts;
+	int hit_timeout;
 	ktime_t timeout;
+	struct hrtimer t;
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	ktime_t napi_busy_poll_dt;
 	bool napi_prefer_busy_poll;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-19 23:28 [PATCHSET v4 0/5] Add support for batched min timeout Jens Axboe
@ 2024-08-19 23:28 ` Jens Axboe
  2024-08-20 20:08   ` David Wei
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-19 23:28 UTC (permalink / raw)
  To: io-uring; +Cc: dw, Jens Axboe

In preparation for having two distinct timeouts and avoid waking the
task if we don't need to.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
 io_uring/io_uring.h |  2 ++
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9e2b8d4c05db..ddfbe04c61ed 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 	 * the task, and the next invocation will do it.
 	 */
-	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
+	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
 		return autoremove_wake_function(curr, mode, wake_flags, key);
 	return -1;
 }
@@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
 	return percpu_counter_read_positive(&tctx->inflight);
 }
 
+static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
+{
+	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	WRITE_ONCE(iowq->hit_timeout, 1);
+	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+		wake_up_process(ctx->submitter_task);
+	else
+		io_cqring_wake(ctx);
+	return HRTIMER_NORESTART;
+}
+
+static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
+				      clockid_t clock_id)
+{
+	iowq->hit_timeout = 0;
+	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
+	iowq->t.function = io_cqring_timer_wakeup;
+	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
+	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
+
+	if (!READ_ONCE(iowq->hit_timeout))
+		schedule();
+
+	hrtimer_cancel(&iowq->t);
+	destroy_hrtimer_on_stack(&iowq->t);
+	__set_current_state(TASK_RUNNING);
+
+	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
+}
+
 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 				     struct io_wait_queue *iowq)
 {
@@ -2362,11 +2394,10 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 */
 	if (current_pending_io())
 		current->in_iowait = 1;
-	if (iowq->timeout == KTIME_MAX)
+	if (iowq->timeout != KTIME_MAX)
+		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
+	else
 		schedule();
-	else if (!schedule_hrtimeout_range_clock(&iowq->timeout, 0,
-						 HRTIMER_MODE_ABS, ctx->clockid))
-		ret = -ETIME;
 	current->in_iowait = 0;
 	return ret;
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 9935819f12b7..f95c1b080f4b 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -40,7 +40,9 @@ struct io_wait_queue {
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
 	unsigned nr_timeouts;
+	int hit_timeout;
 	ktime_t timeout;
+	struct hrtimer t;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	ktime_t napi_busy_poll_dt;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-19 23:28 ` [PATCH 3/5] io_uring: implement our own schedule timeout handling Jens Axboe
@ 2024-08-20 20:08   ` David Wei
  2024-08-20 21:34     ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: David Wei @ 2024-08-20 20:08 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 2024-08-19 16:28, Jens Axboe wrote:
> In preparation for having two distinct timeouts and avoid waking the
> task if we don't need to.
> 
> Signed-off-by: Jens Axboe <[email protected]>
> ---
>  io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
>  io_uring/io_uring.h |  2 ++
>  2 files changed, 38 insertions(+), 5 deletions(-)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 9e2b8d4c05db..ddfbe04c61ed 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>  	 * the task, and the next invocation will do it.
>  	 */
> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)

iowq->hit_timeout may be modified in a timer softirq context, while this
wait_queue_func_t (AIUI) may get called from any context e.g.
net_rx_softirq for sockets. Does this need a READ_ONLY()?

>  		return autoremove_wake_function(curr, mode, wake_flags, key);
>  	return -1;
>  }
> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
>  	return percpu_counter_read_positive(&tctx->inflight);
>  }
>  
> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
> +{
> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	WRITE_ONCE(iowq->hit_timeout, 1);
> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
> +		wake_up_process(ctx->submitter_task);
> +	else
> +		io_cqring_wake(ctx);

This is a bit different to schedule_hrtimeout_range_clock(). Why is
io_cqring_wake() needed here for non-DEFER_TASKRUN?

> +	return HRTIMER_NORESTART;
> +}
> +
> +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
> +				      clockid_t clock_id)
> +{
> +	iowq->hit_timeout = 0;
> +	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
> +	iowq->t.function = io_cqring_timer_wakeup;
> +	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
> +	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
> +
> +	if (!READ_ONCE(iowq->hit_timeout))
> +		schedule();
> +
> +	hrtimer_cancel(&iowq->t);
> +	destroy_hrtimer_on_stack(&iowq->t);
> +	__set_current_state(TASK_RUNNING);
> +
> +	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
> +}
> +
>  static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>  				     struct io_wait_queue *iowq)
>  {
> @@ -2362,11 +2394,10 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>  	 */
>  	if (current_pending_io())
>  		current->in_iowait = 1;
> -	if (iowq->timeout == KTIME_MAX)
> +	if (iowq->timeout != KTIME_MAX)
> +		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
> +	else
>  		schedule();
> -	else if (!schedule_hrtimeout_range_clock(&iowq->timeout, 0,
> -						 HRTIMER_MODE_ABS, ctx->clockid))
> -		ret = -ETIME;
>  	current->in_iowait = 0;
>  	return ret;
>  }
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index 9935819f12b7..f95c1b080f4b 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -40,7 +40,9 @@ struct io_wait_queue {
>  	struct io_ring_ctx *ctx;
>  	unsigned cq_tail;
>  	unsigned nr_timeouts;
> +	int hit_timeout;
>  	ktime_t timeout;
> +	struct hrtimer t;
>  
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>  	ktime_t napi_busy_poll_dt;

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 20:08   ` David Wei
@ 2024-08-20 21:34     ` Jens Axboe
  2024-08-20 21:37       ` David Wei
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-20 21:34 UTC (permalink / raw)
  To: David Wei, io-uring

On 8/20/24 2:08 PM, David Wei wrote:
> On 2024-08-19 16:28, Jens Axboe wrote:
>> In preparation for having two distinct timeouts and avoid waking the
>> task if we don't need to.
>>
>> Signed-off-by: Jens Axboe <[email protected]>
>> ---
>>  io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
>>  io_uring/io_uring.h |  2 ++
>>  2 files changed, 38 insertions(+), 5 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 9e2b8d4c05db..ddfbe04c61ed 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>  	 * the task, and the next invocation will do it.
>>  	 */
>> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
>> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
> 
> iowq->hit_timeout may be modified in a timer softirq context, while this
> wait_queue_func_t (AIUI) may get called from any context e.g.
> net_rx_softirq for sockets. Does this need a READ_ONLY()?

Yes probably not a bad idea to make it READ_ONCE().

>>  		return autoremove_wake_function(curr, mode, wake_flags, key);
>>  	return -1;
>>  }
>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
>>  	return percpu_counter_read_positive(&tctx->inflight);
>>  }
>>  
>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>> +{
>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>> +	struct io_ring_ctx *ctx = iowq->ctx;
>> +
>> +	WRITE_ONCE(iowq->hit_timeout, 1);
>> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>> +		wake_up_process(ctx->submitter_task);
>> +	else
>> +		io_cqring_wake(ctx);
> 
> This is a bit different to schedule_hrtimeout_range_clock(). Why is
> io_cqring_wake() needed here for non-DEFER_TASKRUN?

That's how the wakeups work - for defer taskrun, the task isn't on a
waitqueue at all. Hence we need to wake the task itself. For any other
setup, they will be on the waitqueue, and we just call io_cqring_wake()
to wake up anyone waiting on the waitqueue. That will iterate the wake
queue and call handlers for each item. Having a separate handler for
that will allow to NOT wake up the task if we don't need to.
taskrun, the waker

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 21:34     ` Jens Axboe
@ 2024-08-20 21:37       ` David Wei
  2024-08-20 21:39         ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: David Wei @ 2024-08-20 21:37 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 2024-08-20 14:34, Jens Axboe wrote:
> On 8/20/24 2:08 PM, David Wei wrote:
>> On 2024-08-19 16:28, Jens Axboe wrote:
>>> In preparation for having two distinct timeouts and avoid waking the
>>> task if we don't need to.
>>>
>>> Signed-off-by: Jens Axboe <[email protected]>
>>> ---
>>>  io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
>>>  io_uring/io_uring.h |  2 ++
>>>  2 files changed, 38 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> index 9e2b8d4c05db..ddfbe04c61ed 100644
>>> --- a/io_uring/io_uring.c
>>> +++ b/io_uring/io_uring.c
>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>>  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>>  	 * the task, and the next invocation will do it.
>>>  	 */
>>> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
>>> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
>>
>> iowq->hit_timeout may be modified in a timer softirq context, while this
>> wait_queue_func_t (AIUI) may get called from any context e.g.
>> net_rx_softirq for sockets. Does this need a READ_ONLY()?
> 
> Yes probably not a bad idea to make it READ_ONCE().
> 
>>>  		return autoremove_wake_function(curr, mode, wake_flags, key);
>>>  	return -1;
>>>  }
>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
>>>  	return percpu_counter_read_positive(&tctx->inflight);
>>>  }
>>>  
>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>> +{
>>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>> +
>>> +	WRITE_ONCE(iowq->hit_timeout, 1);
>>> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>> +		wake_up_process(ctx->submitter_task);
>>> +	else
>>> +		io_cqring_wake(ctx);
>>
>> This is a bit different to schedule_hrtimeout_range_clock(). Why is
>> io_cqring_wake() needed here for non-DEFER_TASKRUN?
> 
> That's how the wakeups work - for defer taskrun, the task isn't on a
> waitqueue at all. Hence we need to wake the task itself. For any other
> setup, they will be on the waitqueue, and we just call io_cqring_wake()
> to wake up anyone waiting on the waitqueue. That will iterate the wake
> queue and call handlers for each item. Having a separate handler for
> that will allow to NOT wake up the task if we don't need to.
> taskrun, the waker

To rephase the question, why is the original code calling
schedule_hrtimeout_range_clock() not needing to differentiate behaviour
between defer taskrun and not?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 21:37       ` David Wei
@ 2024-08-20 21:39         ` Jens Axboe
  2024-08-20 22:04           ` Pavel Begunkov
  2024-08-20 22:06           ` David Wei
  0 siblings, 2 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-20 21:39 UTC (permalink / raw)
  To: David Wei, io-uring

On 8/20/24 3:37 PM, David Wei wrote:
> On 2024-08-20 14:34, Jens Axboe wrote:
>> On 8/20/24 2:08 PM, David Wei wrote:
>>> On 2024-08-19 16:28, Jens Axboe wrote:
>>>> In preparation for having two distinct timeouts and avoid waking the
>>>> task if we don't need to.
>>>>
>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>> ---
>>>>  io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
>>>>  io_uring/io_uring.h |  2 ++
>>>>  2 files changed, 38 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>> index 9e2b8d4c05db..ddfbe04c61ed 100644
>>>> --- a/io_uring/io_uring.c
>>>> +++ b/io_uring/io_uring.c
>>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>>>  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>>>  	 * the task, and the next invocation will do it.
>>>>  	 */
>>>> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
>>>> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
>>>
>>> iowq->hit_timeout may be modified in a timer softirq context, while this
>>> wait_queue_func_t (AIUI) may get called from any context e.g.
>>> net_rx_softirq for sockets. Does this need a READ_ONLY()?
>>
>> Yes probably not a bad idea to make it READ_ONCE().
>>
>>>>  		return autoremove_wake_function(curr, mode, wake_flags, key);
>>>>  	return -1;
>>>>  }
>>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
>>>>  	return percpu_counter_read_positive(&tctx->inflight);
>>>>  }
>>>>  
>>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>>> +{
>>>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>>> +
>>>> +	WRITE_ONCE(iowq->hit_timeout, 1);
>>>> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>>> +		wake_up_process(ctx->submitter_task);
>>>> +	else
>>>> +		io_cqring_wake(ctx);
>>>
>>> This is a bit different to schedule_hrtimeout_range_clock(). Why is
>>> io_cqring_wake() needed here for non-DEFER_TASKRUN?
>>
>> That's how the wakeups work - for defer taskrun, the task isn't on a
>> waitqueue at all. Hence we need to wake the task itself. For any other
>> setup, they will be on the waitqueue, and we just call io_cqring_wake()
>> to wake up anyone waiting on the waitqueue. That will iterate the wake
>> queue and call handlers for each item. Having a separate handler for
>> that will allow to NOT wake up the task if we don't need to.
>> taskrun, the waker
> 
> To rephase the question, why is the original code calling
> schedule_hrtimeout_range_clock() not needing to differentiate behaviour
> between defer taskrun and not?

Because that part is the same, the task schedules out and goes to sleep.
That has always been the same regardless of how the ring is setup. Only
difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and
hence cannot be woken by a wake_up(ctx->wait). We have to wake the task
manually.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 21:39         ` Jens Axboe
@ 2024-08-20 22:04           ` Pavel Begunkov
  2024-08-20 22:06           ` David Wei
  1 sibling, 0 replies; 29+ messages in thread
From: Pavel Begunkov @ 2024-08-20 22:04 UTC (permalink / raw)
  To: Jens Axboe, David Wei, io-uring

On 8/20/24 22:39, Jens Axboe wrote:
> On 8/20/24 3:37 PM, David Wei wrote:
>> On 2024-08-20 14:34, Jens Axboe wrote:
>>> On 8/20/24 2:08 PM, David Wei wrote:
>>>> On 2024-08-19 16:28, Jens Axboe wrote:
>>>>> In preparation for having two distinct timeouts and avoid waking the
>>>>> task if we don't need to.
>>>>>
>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>> ---
>>>>>   io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
>>>>>   io_uring/io_uring.h |  2 ++
>>>>>   2 files changed, 38 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>>> index 9e2b8d4c05db..ddfbe04c61ed 100644
>>>>> --- a/io_uring/io_uring.c
>>>>> +++ b/io_uring/io_uring.c
>>>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>>>>   	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>>>>   	 * the task, and the next invocation will do it.
>>>>>   	 */
>>>>> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
>>>>> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
>>>>
>>>> iowq->hit_timeout may be modified in a timer softirq context, while this
>>>> wait_queue_func_t (AIUI) may get called from any context e.g.
>>>> net_rx_softirq for sockets. Does this need a READ_ONLY()?
>>>
>>> Yes probably not a bad idea to make it READ_ONCE().
>>>
>>>>>   		return autoremove_wake_function(curr, mode, wake_flags, key);
>>>>>   	return -1;
>>>>>   }
>>>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
>>>>>   	return percpu_counter_read_positive(&tctx->inflight);
>>>>>   }
>>>>>   
>>>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>>>> +{
>>>>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>>>> +
>>>>> +	WRITE_ONCE(iowq->hit_timeout, 1);
>>>>> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>>>> +		wake_up_process(ctx->submitter_task);
>>>>> +	else
>>>>> +		io_cqring_wake(ctx);
>>>>
>>>> This is a bit different to schedule_hrtimeout_range_clock(). Why is
>>>> io_cqring_wake() needed here for non-DEFER_TASKRUN?
>>>
>>> That's how the wakeups work - for defer taskrun, the task isn't on a
>>> waitqueue at all. Hence we need to wake the task itself. For any other
>>> setup, they will be on the waitqueue, and we just call io_cqring_wake()
>>> to wake up anyone waiting on the waitqueue. That will iterate the wake
>>> queue and call handlers for each item. Having a separate handler for
>>> that will allow to NOT wake up the task if we don't need to.
>>> taskrun, the waker
>>
>> To rephase the question, why is the original code calling
>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour
>> between defer taskrun and not?
> 
> Because that part is the same, the task schedules out and goes to sleep.
> That has always been the same regardless of how the ring is setup. Only
> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and
> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task
> manually.

Answering the question, for !IORING_SETUP_DEFER_TASKRUN, before it
was waking only the task that started the timeout. Now,
io_cqring_wake(ctx) wakes all waiters, so if there are multiple tasks
waiting, a timeout of one waiter will try to wake all of them.

I believe it's unintentional, but I don't think we care either. You
can save the waiter task struct and wake it specifically.


-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 21:39         ` Jens Axboe
  2024-08-20 22:04           ` Pavel Begunkov
@ 2024-08-20 22:06           ` David Wei
  2024-08-20 22:13             ` Pavel Begunkov
  1 sibling, 1 reply; 29+ messages in thread
From: David Wei @ 2024-08-20 22:06 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 2024-08-20 14:39, Jens Axboe wrote:
> On 8/20/24 3:37 PM, David Wei wrote:
>> On 2024-08-20 14:34, Jens Axboe wrote:
>>> On 8/20/24 2:08 PM, David Wei wrote:
>>>> On 2024-08-19 16:28, Jens Axboe wrote:
>>>>> In preparation for having two distinct timeouts and avoid waking the
>>>>> task if we don't need to.
>>>>>
>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>> ---
>>>>>  io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
>>>>>  io_uring/io_uring.h |  2 ++
>>>>>  2 files changed, 38 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>>> index 9e2b8d4c05db..ddfbe04c61ed 100644
>>>>> --- a/io_uring/io_uring.c
>>>>> +++ b/io_uring/io_uring.c
>>>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>>>>  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>>>>  	 * the task, and the next invocation will do it.
>>>>>  	 */
>>>>> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
>>>>> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
>>>>
>>>> iowq->hit_timeout may be modified in a timer softirq context, while this
>>>> wait_queue_func_t (AIUI) may get called from any context e.g.
>>>> net_rx_softirq for sockets. Does this need a READ_ONLY()?
>>>
>>> Yes probably not a bad idea to make it READ_ONCE().
>>>
>>>>>  		return autoremove_wake_function(curr, mode, wake_flags, key);
>>>>>  	return -1;
>>>>>  }
>>>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
>>>>>  	return percpu_counter_read_positive(&tctx->inflight);
>>>>>  }
>>>>>  
>>>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>>>> +{
>>>>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>>>> +
>>>>> +	WRITE_ONCE(iowq->hit_timeout, 1);
>>>>> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>>>> +		wake_up_process(ctx->submitter_task);
>>>>> +	else
>>>>> +		io_cqring_wake(ctx);
>>>>
>>>> This is a bit different to schedule_hrtimeout_range_clock(). Why is
>>>> io_cqring_wake() needed here for non-DEFER_TASKRUN?
>>>
>>> That's how the wakeups work - for defer taskrun, the task isn't on a
>>> waitqueue at all. Hence we need to wake the task itself. For any other
>>> setup, they will be on the waitqueue, and we just call io_cqring_wake()
>>> to wake up anyone waiting on the waitqueue. That will iterate the wake
>>> queue and call handlers for each item. Having a separate handler for
>>> that will allow to NOT wake up the task if we don't need to.
>>> taskrun, the waker
>>
>> To rephase the question, why is the original code calling
>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour
>> between defer taskrun and not?
> 
> Because that part is the same, the task schedules out and goes to sleep.
> That has always been the same regardless of how the ring is setup. Only
> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and
> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task
> manually.
> 

io_cqring_timer_wakeup() is the timer expired callback which calls
wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN.

The original code calling schedule_hrtimeout_range_clock() uses
hrtimer_sleeper instead, which has a default timer expired callback set
to hrtimer_wakeup().

hrtimer_wakeup() only calls wake_up_process().

My question is: why this asymmetry? Why does the new custom callback
require io_cqring_wake()?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 22:06           ` David Wei
@ 2024-08-20 22:13             ` Pavel Begunkov
  2024-08-20 22:14               ` David Wei
  0 siblings, 1 reply; 29+ messages in thread
From: Pavel Begunkov @ 2024-08-20 22:13 UTC (permalink / raw)
  To: David Wei, Jens Axboe, io-uring

On 8/20/24 23:06, David Wei wrote:
> On 2024-08-20 14:39, Jens Axboe wrote:
>> On 8/20/24 3:37 PM, David Wei wrote:
>>> On 2024-08-20 14:34, Jens Axboe wrote:
>>>> On 8/20/24 2:08 PM, David Wei wrote:
>>>>> On 2024-08-19 16:28, Jens Axboe wrote:
>>>>>> In preparation for having two distinct timeouts and avoid waking the
>>>>>> task if we don't need to.
>>>>>>
>>>>>> Signed-off-by: Jens Axboe <[email protected]>
>>>>>> ---
>>>>>>   io_uring/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++-----
>>>>>>   io_uring/io_uring.h |  2 ++
>>>>>>   2 files changed, 38 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>>>> index 9e2b8d4c05db..ddfbe04c61ed 100644
>>>>>> --- a/io_uring/io_uring.c
>>>>>> +++ b/io_uring/io_uring.c
>>>>>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>>>>>   	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>>>>>   	 * the task, and the next invocation will do it.
>>>>>>   	 */
>>>>>> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
>>>>>> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
>>>>>
>>>>> iowq->hit_timeout may be modified in a timer softirq context, while this
>>>>> wait_queue_func_t (AIUI) may get called from any context e.g.
>>>>> net_rx_softirq for sockets. Does this need a READ_ONLY()?
>>>>
>>>> Yes probably not a bad idea to make it READ_ONCE().
>>>>
>>>>>>   		return autoremove_wake_function(curr, mode, wake_flags, key);
>>>>>>   	return -1;
>>>>>>   }
>>>>>> @@ -2350,6 +2350,38 @@ static bool current_pending_io(void)
>>>>>>   	return percpu_counter_read_positive(&tctx->inflight);
>>>>>>   }
>>>>>>   
>>>>>> +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>>>>> +{
>>>>>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>>>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>>>>> +
>>>>>> +	WRITE_ONCE(iowq->hit_timeout, 1);
>>>>>> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
>>>>>> +		wake_up_process(ctx->submitter_task);
>>>>>> +	else
>>>>>> +		io_cqring_wake(ctx);
>>>>>
>>>>> This is a bit different to schedule_hrtimeout_range_clock(). Why is
>>>>> io_cqring_wake() needed here for non-DEFER_TASKRUN?
>>>>
>>>> That's how the wakeups work - for defer taskrun, the task isn't on a
>>>> waitqueue at all. Hence we need to wake the task itself. For any other
>>>> setup, they will be on the waitqueue, and we just call io_cqring_wake()
>>>> to wake up anyone waiting on the waitqueue. That will iterate the wake
>>>> queue and call handlers for each item. Having a separate handler for
>>>> that will allow to NOT wake up the task if we don't need to.
>>>> taskrun, the waker
>>>
>>> To rephase the question, why is the original code calling
>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour
>>> between defer taskrun and not?
>>
>> Because that part is the same, the task schedules out and goes to sleep.
>> That has always been the same regardless of how the ring is setup. Only
>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and
>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task
>> manually.
>>
> 
> io_cqring_timer_wakeup() is the timer expired callback which calls
> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN.
> 
> The original code calling schedule_hrtimeout_range_clock() uses
> hrtimer_sleeper instead, which has a default timer expired callback set
> to hrtimer_wakeup().
> 
> hrtimer_wakeup() only calls wake_up_process().
> 
> My question is: why this asymmetry? Why does the new custom callback
> require io_cqring_wake()?

That's what I'm saying, it doesn't need and doesn't really want it.
 From the correctness point of view, it's ok since we wake up a
(unnecessarily) larger set of tasks.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 22:13             ` Pavel Begunkov
@ 2024-08-20 22:14               ` David Wei
  2024-08-20 22:19                 ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: David Wei @ 2024-08-20 22:14 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring

On 2024-08-20 15:13, Pavel Begunkov wrote:
>>>>> On 8/20/24 2:08 PM, David Wei wrote:
>>>> To rephase the question, why is the original code calling
>>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour
>>>> between defer taskrun and not?
>>>
>>> Because that part is the same, the task schedules out and goes to sleep.
>>> That has always been the same regardless of how the ring is setup. Only
>>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and
>>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task
>>> manually.
>>>
>>
>> io_cqring_timer_wakeup() is the timer expired callback which calls
>> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN.
>>
>> The original code calling schedule_hrtimeout_range_clock() uses
>> hrtimer_sleeper instead, which has a default timer expired callback set
>> to hrtimer_wakeup().
>>
>> hrtimer_wakeup() only calls wake_up_process().
>>
>> My question is: why this asymmetry? Why does the new custom callback
>> require io_cqring_wake()?
> 
> That's what I'm saying, it doesn't need and doesn't really want it.
> From the correctness point of view, it's ok since we wake up a
> (unnecessarily) larger set of tasks.
> 

Yeah your explanation that came in while I was writing the email
answered it, thanks Pavel.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 22:14               ` David Wei
@ 2024-08-20 22:19                 ` Jens Axboe
  2024-08-20 22:51                   ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-20 22:19 UTC (permalink / raw)
  To: David Wei, Pavel Begunkov, io-uring

On 8/20/24 4:14 PM, David Wei wrote:
> On 2024-08-20 15:13, Pavel Begunkov wrote:
>>>>>> On 8/20/24 2:08 PM, David Wei wrote:
>>>>> To rephase the question, why is the original code calling
>>>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour
>>>>> between defer taskrun and not?
>>>>
>>>> Because that part is the same, the task schedules out and goes to sleep.
>>>> That has always been the same regardless of how the ring is setup. Only
>>>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and
>>>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task
>>>> manually.
>>>>
>>>
>>> io_cqring_timer_wakeup() is the timer expired callback which calls
>>> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN.
>>>
>>> The original code calling schedule_hrtimeout_range_clock() uses
>>> hrtimer_sleeper instead, which has a default timer expired callback set
>>> to hrtimer_wakeup().
>>>
>>> hrtimer_wakeup() only calls wake_up_process().
>>>
>>> My question is: why this asymmetry? Why does the new custom callback
>>> require io_cqring_wake()?
>>
>> That's what I'm saying, it doesn't need and doesn't really want it.
>> From the correctness point of view, it's ok since we wake up a
>> (unnecessarily) larger set of tasks.
>>
> 
> Yeah your explanation that came in while I was writing the email
> answered it, thanks Pavel.

Hah and now I see what you meant - yeah we can just remove the
distinction. I didn't see anything in testing, but I also didn't have
multiple tasks waiting on a ring, nor would you. So it doesn't really
matter, but I'll clean it up so there's no confusion.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 22:19                 ` Jens Axboe
@ 2024-08-20 22:51                   ` Jens Axboe
  2024-08-20 22:54                     ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-20 22:51 UTC (permalink / raw)
  To: David Wei, Pavel Begunkov, io-uring

On 8/20/24 4:19 PM, Jens Axboe wrote:
> On 8/20/24 4:14 PM, David Wei wrote:
>> On 2024-08-20 15:13, Pavel Begunkov wrote:
>>>>>>> On 8/20/24 2:08 PM, David Wei wrote:
>>>>>> To rephase the question, why is the original code calling
>>>>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour
>>>>>> between defer taskrun and not?
>>>>>
>>>>> Because that part is the same, the task schedules out and goes to sleep.
>>>>> That has always been the same regardless of how the ring is setup. Only
>>>>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and
>>>>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task
>>>>> manually.
>>>>>
>>>>
>>>> io_cqring_timer_wakeup() is the timer expired callback which calls
>>>> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN.
>>>>
>>>> The original code calling schedule_hrtimeout_range_clock() uses
>>>> hrtimer_sleeper instead, which has a default timer expired callback set
>>>> to hrtimer_wakeup().
>>>>
>>>> hrtimer_wakeup() only calls wake_up_process().
>>>>
>>>> My question is: why this asymmetry? Why does the new custom callback
>>>> require io_cqring_wake()?
>>>
>>> That's what I'm saying, it doesn't need and doesn't really want it.
>>> From the correctness point of view, it's ok since we wake up a
>>> (unnecessarily) larger set of tasks.
>>>
>>
>> Yeah your explanation that came in while I was writing the email
>> answered it, thanks Pavel.
> 
> Hah and now I see what you meant - yeah we can just remove the
> distinction. I didn't see anything in testing, but I also didn't have
> multiple tasks waiting on a ring, nor would you. So it doesn't really
> matter, but I'll clean it up so there's no confusion.

Actually probably better to just leave it as-is, as we'd otherwise need
to store a task in io_wait_queue. Which we of course could, and would
remove a branch in there...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-20 22:51                   ` Jens Axboe
@ 2024-08-20 22:54                     ` Jens Axboe
  0 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-20 22:54 UTC (permalink / raw)
  To: David Wei, Pavel Begunkov, io-uring

On 8/20/24 4:51 PM, Jens Axboe wrote:
> On 8/20/24 4:19 PM, Jens Axboe wrote:
>> On 8/20/24 4:14 PM, David Wei wrote:
>>> On 2024-08-20 15:13, Pavel Begunkov wrote:
>>>>>>>> On 8/20/24 2:08 PM, David Wei wrote:
>>>>>>> To rephase the question, why is the original code calling
>>>>>>> schedule_hrtimeout_range_clock() not needing to differentiate behaviour
>>>>>>> between defer taskrun and not?
>>>>>>
>>>>>> Because that part is the same, the task schedules out and goes to sleep.
>>>>>> That has always been the same regardless of how the ring is setup. Only
>>>>>> difference is that DEFER_TASKRUN doesn't add itself to ctx->wait, and
>>>>>> hence cannot be woken by a wake_up(ctx->wait). We have to wake the task
>>>>>> manually.
>>>>>>
>>>>>
>>>>> io_cqring_timer_wakeup() is the timer expired callback which calls
>>>>> wake_up_process() or io_cqring_wake() depending on DEFER_TASKRUN.
>>>>>
>>>>> The original code calling schedule_hrtimeout_range_clock() uses
>>>>> hrtimer_sleeper instead, which has a default timer expired callback set
>>>>> to hrtimer_wakeup().
>>>>>
>>>>> hrtimer_wakeup() only calls wake_up_process().
>>>>>
>>>>> My question is: why this asymmetry? Why does the new custom callback
>>>>> require io_cqring_wake()?
>>>>
>>>> That's what I'm saying, it doesn't need and doesn't really want it.
>>>> From the correctness point of view, it's ok since we wake up a
>>>> (unnecessarily) larger set of tasks.
>>>>
>>>
>>> Yeah your explanation that came in while I was writing the email
>>> answered it, thanks Pavel.
>>
>> Hah and now I see what you meant - yeah we can just remove the
>> distinction. I didn't see anything in testing, but I also didn't have
>> multiple tasks waiting on a ring, nor would you. So it doesn't really
>> matter, but I'll clean it up so there's no confusion.
> 
> Actually probably better to just leave it as-is, as we'd otherwise need
> to store a task in io_wait_queue. Which we of course could, and would
> remove a branch in there...

I guess I should actually look at the code first, we have it via
wq->private already. Hence:


diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ddfbe04c61ed..4ba5292137c3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2353,13 +2353,9 @@ static bool current_pending_io(void)
 static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
 {
 	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
-	struct io_ring_ctx *ctx = iowq->ctx;
 
 	WRITE_ONCE(iowq->hit_timeout, 1);
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
-		wake_up_process(ctx->submitter_task);
-	else
-		io_cqring_wake(ctx);
+	wake_up_process(iowq->wq.private);
 	return HRTIMER_NORESTART;
 }
 

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCHSET v5 0/5] Add support for batched min timeout
@ 2024-08-21 14:16 Jens Axboe
  2024-08-21 14:16 ` [PATCH 1/5] io_uring: encapsulate extraneous wait flags into a separate struct Jens Axboe
                   ` (4 more replies)
  0 siblings, 5 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-21 14:16 UTC (permalink / raw)
  To: io-uring; +Cc: dw

Hi,

Here's v5 of the min-wait patchset. For a full description, see the v2
posting:

https://lore.kernel.org/io-uring/[email protected]/

As before, there's a liburing branch with added test cases, it can be
found here:

https://git.kernel.dk/cgit/liburing/log/?h=min-wait

The patches are on top of master with for-6.12/io_uring pulled in.

Changes since v4:
- Use READ/WRITE_ONCE consistently with iowq->hit_timeout
- Unify how io_cqring_timer_wakeup() handles ring types by always using
  wake_up_process().
- Fix race in min timer wakeup with DEFER_TASKRUN
- Don't reset ctx->cq_wait_nr if timeout has been hit

 include/uapi/linux/io_uring.h |   3 +-
 io_uring/io_uring.c           | 191 +++++++++++++++++++++++++---------
 io_uring/io_uring.h           |   4 +
 3 files changed, 150 insertions(+), 48 deletions(-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 1/5] io_uring: encapsulate extraneous wait flags into a separate struct
  2024-08-21 14:16 [PATCHSET v5 0/5] Add support for batched min timeout Jens Axboe
@ 2024-08-21 14:16 ` Jens Axboe
  2024-08-21 14:16 ` [PATCH 2/5] io_uring: move schedule wait logic into helper Jens Axboe
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-21 14:16 UTC (permalink / raw)
  To: io-uring; +Cc: dw, Jens Axboe

Rather than need to pass in 2 or 3 separate arguments, add a struct
to encapsulate the timeout and sigset_t parts of waiting. In preparation
for adding another argument for waiting.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 20229e72b65c..37053d32c668 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2384,13 +2384,18 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+struct ext_arg {
+	size_t argsz;
+	struct __kernel_timespec __user *ts;
+	const sigset_t __user *sig;
+};
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
  */
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
-			  const sigset_t __user *sig, size_t sigsz,
-			  struct __kernel_timespec __user *uts)
+			  struct ext_arg *ext_arg)
 {
 	struct io_wait_queue iowq;
 	struct io_rings *rings = ctx->rings;
@@ -2415,10 +2420,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
 	iowq.timeout = KTIME_MAX;
 
-	if (uts) {
+	if (ext_arg->ts) {
 		struct timespec64 ts;
 
-		if (get_timespec64(&ts, uts))
+		if (get_timespec64(&ts, ext_arg->ts))
 			return -EFAULT;
 
 		iowq.timeout = timespec64_to_ktime(ts);
@@ -2426,14 +2431,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 			iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
 	}
 
-	if (sig) {
+	if (ext_arg->sig) {
 #ifdef CONFIG_COMPAT
 		if (in_compat_syscall())
-			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
-						      sigsz);
+			ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig,
+						      ext_arg->argsz);
 		else
 #endif
-			ret = set_user_sigmask(sig, sigsz);
+			ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz);
 
 		if (ret)
 			return ret;
@@ -3112,9 +3117,8 @@ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t a
 	return 0;
 }
 
-static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
-			  struct __kernel_timespec __user **ts,
-			  const sigset_t __user **sig)
+static int io_get_ext_arg(unsigned flags, const void __user *argp,
+			  struct ext_arg *ext_arg)
 {
 	struct io_uring_getevents_arg arg;
 
@@ -3123,8 +3127,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
 	 * is just a pointer to the sigset_t.
 	 */
 	if (!(flags & IORING_ENTER_EXT_ARG)) {
-		*sig = (const sigset_t __user *) argp;
-		*ts = NULL;
+		ext_arg->sig = (const sigset_t __user *) argp;
+		ext_arg->ts = NULL;
 		return 0;
 	}
 
@@ -3132,15 +3136,15 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
 	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
 	 * timespec and sigset_t pointers if good.
 	 */
-	if (*argsz != sizeof(arg))
+	if (ext_arg->argsz != sizeof(arg))
 		return -EINVAL;
 	if (copy_from_user(&arg, argp, sizeof(arg)))
 		return -EFAULT;
 	if (arg.pad)
 		return -EINVAL;
-	*sig = u64_to_user_ptr(arg.sigmask);
-	*argsz = arg.sigmask_sz;
-	*ts = u64_to_user_ptr(arg.ts);
+	ext_arg->sig = u64_to_user_ptr(arg.sigmask);
+	ext_arg->argsz = arg.sigmask_sz;
+	ext_arg->ts = u64_to_user_ptr(arg.ts);
 	return 0;
 }
 
@@ -3246,15 +3250,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			}
 			mutex_unlock(&ctx->uring_lock);
 		} else {
-			const sigset_t __user *sig;
-			struct __kernel_timespec __user *ts;
+			struct ext_arg ext_arg = { .argsz = argsz };
 
-			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+			ret2 = io_get_ext_arg(flags, argp, &ext_arg);
 			if (likely(!ret2)) {
 				min_complete = min(min_complete,
 						   ctx->cq_entries);
 				ret2 = io_cqring_wait(ctx, min_complete, flags,
-						      sig, argsz, ts);
+						      &ext_arg);
 			}
 		}
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/5] io_uring: move schedule wait logic into helper
  2024-08-21 14:16 [PATCHSET v5 0/5] Add support for batched min timeout Jens Axboe
  2024-08-21 14:16 ` [PATCH 1/5] io_uring: encapsulate extraneous wait flags into a separate struct Jens Axboe
@ 2024-08-21 14:16 ` Jens Axboe
  2024-08-21 14:16 ` [PATCH 3/5] io_uring: implement our own schedule timeout handling Jens Axboe
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-21 14:16 UTC (permalink / raw)
  To: io-uring; +Cc: dw, Jens Axboe

In preparation for expanding how we handle waits, move the actual
schedule and schedule_timeout() handling into a helper.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 37053d32c668..9e2b8d4c05db 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2350,22 +2350,10 @@ static bool current_pending_io(void)
 	return percpu_counter_read_positive(&tctx->inflight);
 }
 
-/* when returns >0, the caller should retry */
-static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-					  struct io_wait_queue *iowq)
+static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+				     struct io_wait_queue *iowq)
 {
-	int ret;
-
-	if (unlikely(READ_ONCE(ctx->check_cq)))
-		return 1;
-	if (unlikely(!llist_empty(&ctx->work_llist)))
-		return 1;
-	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
-		return 1;
-	if (unlikely(task_sigpending(current)))
-		return -EINTR;
-	if (unlikely(io_should_wake(iowq)))
-		return 0;
+	int ret = 0;
 
 	/*
 	 * Mark us as being in io_wait if we have pending requests, so cpufreq
@@ -2374,7 +2362,6 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 */
 	if (current_pending_io())
 		current->in_iowait = 1;
-	ret = 0;
 	if (iowq->timeout == KTIME_MAX)
 		schedule();
 	else if (!schedule_hrtimeout_range_clock(&iowq->timeout, 0,
@@ -2384,6 +2371,24 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+/* If this returns > 0, the caller should retry */
+static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+					  struct io_wait_queue *iowq)
+{
+	if (unlikely(READ_ONCE(ctx->check_cq)))
+		return 1;
+	if (unlikely(!llist_empty(&ctx->work_llist)))
+		return 1;
+	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
+		return 1;
+	if (unlikely(task_sigpending(current)))
+		return -EINTR;
+	if (unlikely(io_should_wake(iowq)))
+		return 0;
+
+	return __io_cqring_wait_schedule(ctx, iowq);
+}
+
 struct ext_arg {
 	size_t argsz;
 	struct __kernel_timespec __user *ts;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-21 14:16 [PATCHSET v5 0/5] Add support for batched min timeout Jens Axboe
  2024-08-21 14:16 ` [PATCH 1/5] io_uring: encapsulate extraneous wait flags into a separate struct Jens Axboe
  2024-08-21 14:16 ` [PATCH 2/5] io_uring: move schedule wait logic into helper Jens Axboe
@ 2024-08-21 14:16 ` Jens Axboe
  2024-08-22 13:22   ` Pavel Begunkov
  2024-08-21 14:16 ` [PATCH 4/5] io_uring: add support for batch wait timeout Jens Axboe
  2024-08-21 14:16 ` [PATCH 5/5] io_uring: wire up min batch wake timeout Jens Axboe
  4 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-21 14:16 UTC (permalink / raw)
  To: io-uring; +Cc: dw, Jens Axboe

In preparation for having two distinct timeouts and avoid waking the
task if we don't need to.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 37 ++++++++++++++++++++++++++++++++-----
 io_uring/io_uring.h |  2 ++
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9e2b8d4c05db..4ba5292137c3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 	 * the task, and the next invocation will do it.
 	 */
-	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
+	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
 		return autoremove_wake_function(curr, mode, wake_flags, key);
 	return -1;
 }
@@ -2350,6 +2350,34 @@ static bool current_pending_io(void)
 	return percpu_counter_read_positive(&tctx->inflight);
 }
 
+static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
+{
+	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+
+	WRITE_ONCE(iowq->hit_timeout, 1);
+	wake_up_process(iowq->wq.private);
+	return HRTIMER_NORESTART;
+}
+
+static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
+				      clockid_t clock_id)
+{
+	iowq->hit_timeout = 0;
+	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
+	iowq->t.function = io_cqring_timer_wakeup;
+	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
+	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
+
+	if (!READ_ONCE(iowq->hit_timeout))
+		schedule();
+
+	hrtimer_cancel(&iowq->t);
+	destroy_hrtimer_on_stack(&iowq->t);
+	__set_current_state(TASK_RUNNING);
+
+	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
+}
+
 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 				     struct io_wait_queue *iowq)
 {
@@ -2362,11 +2390,10 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 */
 	if (current_pending_io())
 		current->in_iowait = 1;
-	if (iowq->timeout == KTIME_MAX)
+	if (iowq->timeout != KTIME_MAX)
+		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
+	else
 		schedule();
-	else if (!schedule_hrtimeout_range_clock(&iowq->timeout, 0,
-						 HRTIMER_MODE_ABS, ctx->clockid))
-		ret = -ETIME;
 	current->in_iowait = 0;
 	return ret;
 }
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 9935819f12b7..f95c1b080f4b 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -40,7 +40,9 @@ struct io_wait_queue {
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
 	unsigned nr_timeouts;
+	int hit_timeout;
 	ktime_t timeout;
+	struct hrtimer t;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	ktime_t napi_busy_poll_dt;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-21 14:16 [PATCHSET v5 0/5] Add support for batched min timeout Jens Axboe
                   ` (2 preceding siblings ...)
  2024-08-21 14:16 ` [PATCH 3/5] io_uring: implement our own schedule timeout handling Jens Axboe
@ 2024-08-21 14:16 ` Jens Axboe
  2024-08-21 18:25   ` David Wei
  2024-08-22 13:46   ` Pavel Begunkov
  2024-08-21 14:16 ` [PATCH 5/5] io_uring: wire up min batch wake timeout Jens Axboe
  4 siblings, 2 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-21 14:16 UTC (permalink / raw)
  To: io-uring; +Cc: dw, Jens Axboe

Waiting for events with io_uring has two knobs that can be set:

1) The number of events to wake for
2) The timeout associated with the event

Waiting will abort when either of those conditions are met, as expected.

This adds support for a third event, which is associated with the number
of events to wait for. Applications generally like to handle batches of
completions, and right now they'd set a number of events to wait for and
the timeout for that. If no events have been received but the timeout
triggers, control is returned to the application and it can wait again.
However, if the application doesn't have anything to do until events are
reaped, then it's possible to make this waiting more efficient.

For example, the application may have a latency time of 50 usecs and
wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
as the timeout, then it'll be doing 20K context switches per second even
if nothing is happening.

This introduces the notion of min batch wait time. If the min batch wait
time expires, then we'll return to userspace if we have any events at all.
If none are available, the general wait time is applied. Any request
arriving after the min batch wait time will cause waiting to stop and
return control to the application.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 88 ++++++++++++++++++++++++++++++++++++++-------
 io_uring/io_uring.h |  2 ++
 2 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4ba5292137c3..87e7cf6551d7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2322,7 +2322,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 	 * the task, and the next invocation will do it.
 	 */
-	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
+	if (io_should_wake(iowq) || io_has_work(iowq->ctx) ||
+	    READ_ONCE(iowq->hit_timeout))
 		return autoremove_wake_function(curr, mode, wake_flags, key);
 	return -1;
 }
@@ -2359,13 +2360,66 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+/*
+ * Doing min_timeout portion. If we saw any timeouts, events, or have work,
+ * wake up. If not, and we have a normal timeout, switch to that and keep
+ * sleeping.
+ */
+static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
+{
+	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/* no general timeout, or shorter, we are done */
+	if (iowq->timeout == KTIME_MAX ||
+	    ktime_after(iowq->min_timeout, iowq->timeout))
+		goto out_wake;
+	/* work we may need to run, wake function will see if we need to wake */
+	if (io_has_work(ctx))
+		goto out_wake;
+	/* got events since we started waiting, min timeout is done */
+	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
+		goto out_wake;
+	/* if we have any events and min timeout expired, we're done */
+	if (io_cqring_events(ctx))
+		goto out_wake;
+
+	/*
+	 * If using deferred task_work running and application is waiting on
+	 * more than one request, ensure we reset it now where we are switching
+	 * to normal sleeps. Any request completion post min_wait should wake
+	 * the task and return.
+	 */
+	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+		atomic_set(&ctx->cq_wait_nr, 1);
+		smp_mb();
+		if (!llist_empty(&ctx->work_llist))
+			goto out_wake;
+	}
+
+	iowq->t.function = io_cqring_timer_wakeup;
+	hrtimer_set_expires(timer, iowq->timeout);
+	return HRTIMER_RESTART;
+out_wake:
+	return io_cqring_timer_wakeup(timer);
+}
+
 static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
-				      clockid_t clock_id)
+				      clockid_t clock_id, ktime_t start_time)
 {
-	iowq->hit_timeout = 0;
+	ktime_t timeout;
+
+	WRITE_ONCE(iowq->hit_timeout, 0);
 	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
-	iowq->t.function = io_cqring_timer_wakeup;
-	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
+	if (iowq->min_timeout) {
+		timeout = ktime_add_ns(iowq->min_timeout, start_time);
+		iowq->t.function = io_cqring_min_timer_wakeup;
+	} else {
+		timeout = iowq->timeout;
+		iowq->t.function = io_cqring_timer_wakeup;
+	}
+
+	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
 	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
 
 	if (!READ_ONCE(iowq->hit_timeout))
@@ -2379,7 +2433,8 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
 }
 
 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-				     struct io_wait_queue *iowq)
+				     struct io_wait_queue *iowq,
+				     ktime_t start_time)
 {
 	int ret = 0;
 
@@ -2390,8 +2445,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 */
 	if (current_pending_io())
 		current->in_iowait = 1;
-	if (iowq->timeout != KTIME_MAX)
-		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
+	if (iowq->timeout != KTIME_MAX || iowq->min_timeout != KTIME_MAX)
+		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
 	else
 		schedule();
 	current->in_iowait = 0;
@@ -2400,7 +2455,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 
 /* If this returns > 0, the caller should retry */
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-					  struct io_wait_queue *iowq)
+					  struct io_wait_queue *iowq,
+					  ktime_t start_time)
 {
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
@@ -2413,7 +2469,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	if (unlikely(io_should_wake(iowq)))
 		return 0;
 
-	return __io_cqring_wait_schedule(ctx, iowq);
+	return __io_cqring_wait_schedule(ctx, iowq, start_time);
 }
 
 struct ext_arg {
@@ -2431,6 +2487,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 {
 	struct io_wait_queue iowq;
 	struct io_rings *rings = ctx->rings;
+	ktime_t start_time;
 	int ret;
 
 	if (!io_allowed_run_tw(ctx))
@@ -2449,8 +2506,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	INIT_LIST_HEAD(&iowq.wq.entry);
 	iowq.ctx = ctx;
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
 	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
+	iowq.min_timeout = 0;
 	iowq.timeout = KTIME_MAX;
+	start_time = io_get_time(ctx);
 
 	if (ext_arg->ts) {
 		struct timespec64 ts;
@@ -2460,7 +2520,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 		iowq.timeout = timespec64_to_ktime(ts);
 		if (!(flags & IORING_ENTER_ABS_TIMER))
-			iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
+			iowq.timeout = ktime_add(iowq.timeout, start_time);
 	}
 
 	if (ext_arg->sig) {
@@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		unsigned long check_cq;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-			atomic_set(&ctx->cq_wait_nr, nr_wait);
+			/* if min timeout has been hit, don't reset wait count */
+			if (!READ_ONCE(iowq.hit_timeout))
+				atomic_set(&ctx->cq_wait_nr, nr_wait);
 			set_current_state(TASK_INTERRUPTIBLE);
 		} else {
 			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
 							TASK_INTERRUPTIBLE);
 		}
 
-		ret = io_cqring_wait_schedule(ctx, &iowq);
+		ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
 		__set_current_state(TASK_RUNNING);
 		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index f95c1b080f4b..65078e641390 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -39,8 +39,10 @@ struct io_wait_queue {
 	struct wait_queue_entry wq;
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
+	unsigned cq_min_tail;
 	unsigned nr_timeouts;
 	int hit_timeout;
+	ktime_t min_timeout;
 	ktime_t timeout;
 	struct hrtimer t;
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 5/5] io_uring: wire up min batch wake timeout
  2024-08-21 14:16 [PATCHSET v5 0/5] Add support for batched min timeout Jens Axboe
                   ` (3 preceding siblings ...)
  2024-08-21 14:16 ` [PATCH 4/5] io_uring: add support for batch wait timeout Jens Axboe
@ 2024-08-21 14:16 ` Jens Axboe
  4 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-21 14:16 UTC (permalink / raw)
  To: io-uring; +Cc: dw, Jens Axboe

Expose min_wait_usec in io_uring_getevents_arg, replacing the pad member
that is currently in there. The value is in usecs, which is explained in
the name as well.

Note that if min_wait_usec and a normal timeout is used in conjunction,
the normal timeout is still relative to the base time. For example, if
min_wait_usec is set to 100 and the normal timeout is 1000, the max
total time waited is still 1000. This also means that if the normal
timeout is shorter than min_wait_usec, then only the min_wait_usec will
take effect.

See previous commit for an explanation of how this works.

IORING_FEAT_MIN_TIMEOUT is added as a feature flag for this, as
applications doing submit_and_wait_timeout() style operations will
generally not see the -EINVAL from the wait side as they return the
number of IOs submitted. Only if no IOs are submitted will the -EINVAL
bubble back up to the application.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/uapi/linux/io_uring.h | 3 ++-
 io_uring/io_uring.c           | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7af716136df9..042eab793e26 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -543,6 +543,7 @@ struct io_uring_params {
 #define IORING_FEAT_LINKED_FILE		(1U << 12)
 #define IORING_FEAT_REG_REG_RING	(1U << 13)
 #define IORING_FEAT_RECVSEND_BUNDLE	(1U << 14)
+#define IORING_FEAT_MIN_TIMEOUT		(1U << 15)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -766,7 +767,7 @@ enum io_uring_register_restriction_op {
 struct io_uring_getevents_arg {
 	__u64	sigmask;
 	__u32	sigmask_sz;
-	__u32	pad;
+	__u32	min_wait_usec;
 	__u64	ts;
 };
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 87e7cf6551d7..03b226689e20 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2476,6 +2476,7 @@ struct ext_arg {
 	size_t argsz;
 	struct __kernel_timespec __user *ts;
 	const sigset_t __user *sig;
+	ktime_t min_time;
 };
 
 /*
@@ -2508,7 +2509,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
 	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
-	iowq.min_timeout = 0;
+	iowq.min_timeout = ext_arg->min_time;
 	iowq.timeout = KTIME_MAX;
 	start_time = io_get_time(ctx);
 
@@ -3234,8 +3235,7 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp,
 		return -EINVAL;
 	if (copy_from_user(&arg, argp, sizeof(arg)))
 		return -EFAULT;
-	if (arg.pad)
-		return -EINVAL;
+	ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC;
 	ext_arg->sig = u64_to_user_ptr(arg.sigmask);
 	ext_arg->argsz = arg.sigmask_sz;
 	ext_arg->ts = u64_to_user_ptr(arg.ts);
@@ -3636,7 +3636,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
 			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
 			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
-			IORING_FEAT_RECVSEND_BUNDLE;
+			IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-21 14:16 ` [PATCH 4/5] io_uring: add support for batch wait timeout Jens Axboe
@ 2024-08-21 18:25   ` David Wei
  2024-08-21 18:38     ` Jens Axboe
  2024-08-22 13:46   ` Pavel Begunkov
  1 sibling, 1 reply; 29+ messages in thread
From: David Wei @ 2024-08-21 18:25 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 2024-08-21 07:16, Jens Axboe wrote:
> Waiting for events with io_uring has two knobs that can be set:
> 
> 1) The number of events to wake for
> 2) The timeout associated with the event
> 
> Waiting will abort when either of those conditions are met, as expected.
> 
> This adds support for a third event, which is associated with the number
> of events to wait for. Applications generally like to handle batches of
> completions, and right now they'd set a number of events to wait for and
> the timeout for that. If no events have been received but the timeout
> triggers, control is returned to the application and it can wait again.
> However, if the application doesn't have anything to do until events are
> reaped, then it's possible to make this waiting more efficient.
> 
> For example, the application may have a latency time of 50 usecs and
> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
> as the timeout, then it'll be doing 20K context switches per second even
> if nothing is happening.
> 
> This introduces the notion of min batch wait time. If the min batch wait
> time expires, then we'll return to userspace if we have any events at all.
> If none are available, the general wait time is applied. Any request
> arriving after the min batch wait time will cause waiting to stop and
> return control to the application.
> 
> Signed-off-by: Jens Axboe <[email protected]>
> ---
>  io_uring/io_uring.c | 88 ++++++++++++++++++++++++++++++++++++++-------
>  io_uring/io_uring.h |  2 ++
>  2 files changed, 77 insertions(+), 13 deletions(-)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 4ba5292137c3..87e7cf6551d7 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2322,7 +2322,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>  	 * the task, and the next invocation will do it.
>  	 */
> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) ||
> +	    READ_ONCE(iowq->hit_timeout))
>  		return autoremove_wake_function(curr, mode, wake_flags, key);
>  	return -1;
>  }
> @@ -2359,13 +2360,66 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>  	return HRTIMER_NORESTART;
>  }
>  
> +/*
> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
> + * wake up. If not, and we have a normal timeout, switch to that and keep
> + * sleeping.
> + */
> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
> +{
> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	/* no general timeout, or shorter, we are done */
> +	if (iowq->timeout == KTIME_MAX ||
> +	    ktime_after(iowq->min_timeout, iowq->timeout))
> +		goto out_wake;
> +	/* work we may need to run, wake function will see if we need to wake */
> +	if (io_has_work(ctx))
> +		goto out_wake;
> +	/* got events since we started waiting, min timeout is done */
> +	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
> +		goto out_wake;
> +	/* if we have any events and min timeout expired, we're done */
> +	if (io_cqring_events(ctx))
> +		goto out_wake;
> +
> +	/*
> +	 * If using deferred task_work running and application is waiting on
> +	 * more than one request, ensure we reset it now where we are switching
> +	 * to normal sleeps. Any request completion post min_wait should wake
> +	 * the task and return.
> +	 */
> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
> +		atomic_set(&ctx->cq_wait_nr, 1);
> +		smp_mb();
> +		if (!llist_empty(&ctx->work_llist))
> +			goto out_wake;
> +	}
> +
> +	iowq->t.function = io_cqring_timer_wakeup;
> +	hrtimer_set_expires(timer, iowq->timeout);

What happens if timeout < min_timeout? Would the timer expired callback
io_cqring_timer_wakeup() be called right away?

> +	return HRTIMER_RESTART;
> +out_wake:
> +	return io_cqring_timer_wakeup(timer);
> +}
> +
>  static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
> -				      clockid_t clock_id)
> +				      clockid_t clock_id, ktime_t start_time)
>  {
> -	iowq->hit_timeout = 0;
> +	ktime_t timeout;
> +
> +	WRITE_ONCE(iowq->hit_timeout, 0);
>  	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
> -	iowq->t.function = io_cqring_timer_wakeup;
> -	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
> +	if (iowq->min_timeout) {
> +		timeout = ktime_add_ns(iowq->min_timeout, start_time);
> +		iowq->t.function = io_cqring_min_timer_wakeup;
> +	} else {
> +		timeout = iowq->timeout;
> +		iowq->t.function = io_cqring_timer_wakeup;
> +	}
> +
> +	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
>  	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
>  
>  	if (!READ_ONCE(iowq->hit_timeout))
> @@ -2379,7 +2433,8 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
>  }
>  
>  static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
> -				     struct io_wait_queue *iowq)
> +				     struct io_wait_queue *iowq,
> +				     ktime_t start_time)
>  {
>  	int ret = 0;
>  
> @@ -2390,8 +2445,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>  	 */
>  	if (current_pending_io())
>  		current->in_iowait = 1;
> -	if (iowq->timeout != KTIME_MAX)
> -		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
> +	if (iowq->timeout != KTIME_MAX || iowq->min_timeout != KTIME_MAX)
> +		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);

In this case it is possible for either timeout or min_timeout to be
KTIME_MAX and still schedule a timeout.

If min_timeout != KTIME_MAX and timeout == KTIME_MAX, then
io_cqring_min_timer_wakeup() will reset itself to a timer with
KTIME_MAX.

If min_timeout == KTIME_MAX and timeout != KTIME_MAX, then a KTIME_MAX
timer will be set.

This should be fine, the timer will never expire and schedule() is
called regardless. The previous code is a small optimisation to avoid
setting up a timer that will never expire.

>  	else
>  		schedule();
>  	current->in_iowait = 0;
> @@ -2400,7 +2455,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>  
>  /* If this returns > 0, the caller should retry */
>  static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
> -					  struct io_wait_queue *iowq)
> +					  struct io_wait_queue *iowq,
> +					  ktime_t start_time)
>  {
>  	if (unlikely(READ_ONCE(ctx->check_cq)))
>  		return 1;
> @@ -2413,7 +2469,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>  	if (unlikely(io_should_wake(iowq)))
>  		return 0;
>  
> -	return __io_cqring_wait_schedule(ctx, iowq);
> +	return __io_cqring_wait_schedule(ctx, iowq, start_time);
>  }
>  
>  struct ext_arg {
> @@ -2431,6 +2487,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>  {
>  	struct io_wait_queue iowq;
>  	struct io_rings *rings = ctx->rings;
> +	ktime_t start_time;
>  	int ret;
>  
>  	if (!io_allowed_run_tw(ctx))
> @@ -2449,8 +2506,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>  	INIT_LIST_HEAD(&iowq.wq.entry);
>  	iowq.ctx = ctx;
>  	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
> +	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
>  	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
> +	iowq.min_timeout = 0;
>  	iowq.timeout = KTIME_MAX;
> +	start_time = io_get_time(ctx);
>  
>  	if (ext_arg->ts) {
>  		struct timespec64 ts;
> @@ -2460,7 +2520,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>  
>  		iowq.timeout = timespec64_to_ktime(ts);
>  		if (!(flags & IORING_ENTER_ABS_TIMER))
> -			iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
> +			iowq.timeout = ktime_add(iowq.timeout, start_time);
>  	}
>  
>  	if (ext_arg->sig) {
> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>  		unsigned long check_cq;
>  
>  		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
> -			atomic_set(&ctx->cq_wait_nr, nr_wait);
> +			/* if min timeout has been hit, don't reset wait count */
> +			if (!READ_ONCE(iowq.hit_timeout))
> +				atomic_set(&ctx->cq_wait_nr, nr_wait);

Only the two timeout expired callback functions
io_cqring_min_timer_wakeup() and io_cqring_timer_wakeup() sets
hit_timeout to 1. In this case, io_cqring_schedule_timeout() would
return -ETIME and the do {...} while(1) loop in io_cqring_wait() would
break. So I'm not sure if it is possible to reach here with hit_timeout
= 1.

Also, in the first iteration of the loop, hit_timeout is init to 0
inside of io_cqring_wait_schedule() -> __io_cqring_wait_schedule() ->
io_cqring_schedule_timeout(). So it is possible for hit_timeout to be
READ_ONCE before it is initialised. If this code is kept we should init
iowq.hit_timeout = 0 above.

>  			set_current_state(TASK_INTERRUPTIBLE);
>  		} else {
>  			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
>  							TASK_INTERRUPTIBLE);
>  		}
>  
> -		ret = io_cqring_wait_schedule(ctx, &iowq);
> +		ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
>  		__set_current_state(TASK_RUNNING);
>  		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
>  
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index f95c1b080f4b..65078e641390 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -39,8 +39,10 @@ struct io_wait_queue {
>  	struct wait_queue_entry wq;
>  	struct io_ring_ctx *ctx;
>  	unsigned cq_tail;
> +	unsigned cq_min_tail;
>  	unsigned nr_timeouts;
>  	int hit_timeout;
> +	ktime_t min_timeout;
>  	ktime_t timeout;
>  	struct hrtimer t;
>  

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-21 18:25   ` David Wei
@ 2024-08-21 18:38     ` Jens Axboe
  2024-08-21 18:54       ` David Wei
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-21 18:38 UTC (permalink / raw)
  To: David Wei, io-uring

On 8/21/24 12:25 PM, David Wei wrote:
> On 2024-08-21 07:16, Jens Axboe wrote:
>> Waiting for events with io_uring has two knobs that can be set:
>>
>> 1) The number of events to wake for
>> 2) The timeout associated with the event
>>
>> Waiting will abort when either of those conditions are met, as expected.
>>
>> This adds support for a third event, which is associated with the number
>> of events to wait for. Applications generally like to handle batches of
>> completions, and right now they'd set a number of events to wait for and
>> the timeout for that. If no events have been received but the timeout
>> triggers, control is returned to the application and it can wait again.
>> However, if the application doesn't have anything to do until events are
>> reaped, then it's possible to make this waiting more efficient.
>>
>> For example, the application may have a latency time of 50 usecs and
>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>> as the timeout, then it'll be doing 20K context switches per second even
>> if nothing is happening.
>>
>> This introduces the notion of min batch wait time. If the min batch wait
>> time expires, then we'll return to userspace if we have any events at all.
>> If none are available, the general wait time is applied. Any request
>> arriving after the min batch wait time will cause waiting to stop and
>> return control to the application.
>>
>> Signed-off-by: Jens Axboe <[email protected]>
>> ---
>>  io_uring/io_uring.c | 88 ++++++++++++++++++++++++++++++++++++++-------
>>  io_uring/io_uring.h |  2 ++
>>  2 files changed, 77 insertions(+), 13 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 4ba5292137c3..87e7cf6551d7 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2322,7 +2322,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>  	 * the task, and the next invocation will do it.
>>  	 */
>> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
>> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) ||
>> +	    READ_ONCE(iowq->hit_timeout))
>>  		return autoremove_wake_function(curr, mode, wake_flags, key);
>>  	return -1;
>>  }
>> @@ -2359,13 +2360,66 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>  	return HRTIMER_NORESTART;
>>  }
>>  
>> +/*
>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>> + * sleeping.
>> + */
>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>> +{
>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>> +	struct io_ring_ctx *ctx = iowq->ctx;
>> +
>> +	/* no general timeout, or shorter, we are done */
>> +	if (iowq->timeout == KTIME_MAX ||
>> +	    ktime_after(iowq->min_timeout, iowq->timeout))
>> +		goto out_wake;
>> +	/* work we may need to run, wake function will see if we need to wake */
>> +	if (io_has_work(ctx))
>> +		goto out_wake;
>> +	/* got events since we started waiting, min timeout is done */
>> +	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>> +		goto out_wake;
>> +	/* if we have any events and min timeout expired, we're done */
>> +	if (io_cqring_events(ctx))
>> +		goto out_wake;
>> +
>> +	/*
>> +	 * If using deferred task_work running and application is waiting on
>> +	 * more than one request, ensure we reset it now where we are switching
>> +	 * to normal sleeps. Any request completion post min_wait should wake
>> +	 * the task and return.
>> +	 */
>> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>> +		atomic_set(&ctx->cq_wait_nr, 1);
>> +		smp_mb();
>> +		if (!llist_empty(&ctx->work_llist))
>> +			goto out_wake;
>> +	}
>> +
>> +	iowq->t.function = io_cqring_timer_wakeup;
>> +	hrtimer_set_expires(timer, iowq->timeout);
> 
> What happens if timeout < min_timeout? Would the timer expired callback
> io_cqring_timer_wakeup() be called right away?

See the test cases, test/min-timeout-wait.c has various cases like that
to ensure that they work. But the first check in this function is for
timeout not being set, or being smaller to the min_timeout.

>> +	return HRTIMER_RESTART;
>> +out_wake:
>> +	return io_cqring_timer_wakeup(timer);
>> +}
>> +
>>  static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
>> -				      clockid_t clock_id)
>> +				      clockid_t clock_id, ktime_t start_time)
>>  {
>> -	iowq->hit_timeout = 0;
>> +	ktime_t timeout;
>> +
>> +	WRITE_ONCE(iowq->hit_timeout, 0);
>>  	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
>> -	iowq->t.function = io_cqring_timer_wakeup;
>> -	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
>> +	if (iowq->min_timeout) {
>> +		timeout = ktime_add_ns(iowq->min_timeout, start_time);
>> +		iowq->t.function = io_cqring_min_timer_wakeup;
>> +	} else {
>> +		timeout = iowq->timeout;
>> +		iowq->t.function = io_cqring_timer_wakeup;
>> +	}
>> +
>> +	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
>>  	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
>>  
>>  	if (!READ_ONCE(iowq->hit_timeout))
>> @@ -2379,7 +2433,8 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
>>  }
>>  
>>  static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>> -				     struct io_wait_queue *iowq)
>> +				     struct io_wait_queue *iowq,
>> +				     ktime_t start_time)
>>  {
>>  	int ret = 0;
>>  
>> @@ -2390,8 +2445,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>  	 */
>>  	if (current_pending_io())
>>  		current->in_iowait = 1;
>> -	if (iowq->timeout != KTIME_MAX)
>> -		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
>> +	if (iowq->timeout != KTIME_MAX || iowq->min_timeout != KTIME_MAX)
>> +		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
> 
> In this case it is possible for either timeout or min_timeout to be
> KTIME_MAX and still schedule a timeout.
> 
> If min_timeout != KTIME_MAX and timeout == KTIME_MAX, then
> io_cqring_min_timer_wakeup() will reset itself to a timer with
> KTIME_MAX.
> 
> If min_timeout == KTIME_MAX and timeout != KTIME_MAX, then a KTIME_MAX
> timer will be set.
> 
> This should be fine, the timer will never expire and schedule() is
> called regardless. The previous code is a small optimisation to avoid
> setting up a timer that will never expire.

We should not be setting up a timer if both min-timeout and regular
timeout are not given. Am I missing something? If either is set, we need
a timer to wake us up. If neither is set, we should not be setting up a
timer, we just need to call schedule().

>> @@ -2400,7 +2455,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>  
>>  /* If this returns > 0, the caller should retry */
>>  static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>> -					  struct io_wait_queue *iowq)
>> +					  struct io_wait_queue *iowq,
>> +					  ktime_t start_time)
>>  {
>>  	if (unlikely(READ_ONCE(ctx->check_cq)))
>>  		return 1;
>> @@ -2413,7 +2469,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>  	if (unlikely(io_should_wake(iowq)))
>>  		return 0;
>>  
>> -	return __io_cqring_wait_schedule(ctx, iowq);
>> +	return __io_cqring_wait_schedule(ctx, iowq, start_time);
>>  }
>>  
>>  struct ext_arg {
>> @@ -2431,6 +2487,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>  {
>>  	struct io_wait_queue iowq;
>>  	struct io_rings *rings = ctx->rings;
>> +	ktime_t start_time;
>>  	int ret;
>>  
>>  	if (!io_allowed_run_tw(ctx))
>> @@ -2449,8 +2506,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>  	INIT_LIST_HEAD(&iowq.wq.entry);
>>  	iowq.ctx = ctx;
>>  	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
>> +	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
>>  	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
>> +	iowq.min_timeout = 0;
>>  	iowq.timeout = KTIME_MAX;
>> +	start_time = io_get_time(ctx);
>>  
>>  	if (ext_arg->ts) {
>>  		struct timespec64 ts;
>> @@ -2460,7 +2520,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>  
>>  		iowq.timeout = timespec64_to_ktime(ts);
>>  		if (!(flags & IORING_ENTER_ABS_TIMER))
>> -			iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
>> +			iowq.timeout = ktime_add(iowq.timeout, start_time);
>>  	}
>>  
>>  	if (ext_arg->sig) {
>> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>  		unsigned long check_cq;
>>  
>>  		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>> -			atomic_set(&ctx->cq_wait_nr, nr_wait);
>> +			/* if min timeout has been hit, don't reset wait count */
>> +			if (!READ_ONCE(iowq.hit_timeout))
>> +				atomic_set(&ctx->cq_wait_nr, nr_wait);
> 
> Only the two timeout expired callback functions
> io_cqring_min_timer_wakeup() and io_cqring_timer_wakeup() sets
> hit_timeout to 1. In this case, io_cqring_schedule_timeout() would
> return -ETIME and the do {...} while(1) loop in io_cqring_wait() would
> break. So I'm not sure if it is possible to reach here with hit_timeout
> = 1.
> 
> Also, in the first iteration of the loop, hit_timeout is init to 0
> inside of io_cqring_wait_schedule() -> __io_cqring_wait_schedule() ->
> io_cqring_schedule_timeout(). So it is possible for hit_timeout to be
> READ_ONCE before it is initialised. If this code is kept we should init
> iowq.hit_timeout = 0 above.

Yeah we probably should initialize it. The issue here isn't really if a
timer woke us up, it's if the task got woken by something else and loop
around for another retry. If that coincides with the timeout hitting,
then we should not re-set ->cq_wait_nr as it should've been already set
to 1 so any request being added will wake us up.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-21 18:38     ` Jens Axboe
@ 2024-08-21 18:54       ` David Wei
  0 siblings, 0 replies; 29+ messages in thread
From: David Wei @ 2024-08-21 18:54 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 2024-08-21 11:38, Jens Axboe wrote:
> On 8/21/24 12:25 PM, David Wei wrote:
>> On 2024-08-21 07:16, Jens Axboe wrote:
>>> Waiting for events with io_uring has two knobs that can be set:
>>>
>>> 1) The number of events to wake for
>>> 2) The timeout associated with the event
>>>
>>> Waiting will abort when either of those conditions are met, as expected.
>>>
>>> This adds support for a third event, which is associated with the number
>>> of events to wait for. Applications generally like to handle batches of
>>> completions, and right now they'd set a number of events to wait for and
>>> the timeout for that. If no events have been received but the timeout
>>> triggers, control is returned to the application and it can wait again.
>>> However, if the application doesn't have anything to do until events are
>>> reaped, then it's possible to make this waiting more efficient.
>>>
>>> For example, the application may have a latency time of 50 usecs and
>>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>>> as the timeout, then it'll be doing 20K context switches per second even
>>> if nothing is happening.
>>>
>>> This introduces the notion of min batch wait time. If the min batch wait
>>> time expires, then we'll return to userspace if we have any events at all.
>>> If none are available, the general wait time is applied. Any request
>>> arriving after the min batch wait time will cause waiting to stop and
>>> return control to the application.
>>>
>>> Signed-off-by: Jens Axboe <[email protected]>
>>> ---
>>>  io_uring/io_uring.c | 88 ++++++++++++++++++++++++++++++++++++++-------
>>>  io_uring/io_uring.h |  2 ++
>>>  2 files changed, 77 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> index 4ba5292137c3..87e7cf6551d7 100644
>>> --- a/io_uring/io_uring.c
>>> +++ b/io_uring/io_uring.c
>>> @@ -2322,7 +2322,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>>  	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>>  	 * the task, and the next invocation will do it.
>>>  	 */
>>> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
>>> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) ||
>>> +	    READ_ONCE(iowq->hit_timeout))
>>>  		return autoremove_wake_function(curr, mode, wake_flags, key);
>>>  	return -1;
>>>  }
>>> @@ -2359,13 +2360,66 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>>  	return HRTIMER_NORESTART;
>>>  }
>>>  
>>> +/*
>>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>>> + * sleeping.
>>> + */
>>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>>> +{
>>> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>> +
>>> +	/* no general timeout, or shorter, we are done */
>>> +	if (iowq->timeout == KTIME_MAX ||
>>> +	    ktime_after(iowq->min_timeout, iowq->timeout))
>>> +		goto out_wake;
>>> +	/* work we may need to run, wake function will see if we need to wake */
>>> +	if (io_has_work(ctx))
>>> +		goto out_wake;
>>> +	/* got events since we started waiting, min timeout is done */
>>> +	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>>> +		goto out_wake;
>>> +	/* if we have any events and min timeout expired, we're done */
>>> +	if (io_cqring_events(ctx))
>>> +		goto out_wake;
>>> +
>>> +	/*
>>> +	 * If using deferred task_work running and application is waiting on
>>> +	 * more than one request, ensure we reset it now where we are switching
>>> +	 * to normal sleeps. Any request completion post min_wait should wake
>>> +	 * the task and return.
>>> +	 */
>>> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>>> +		atomic_set(&ctx->cq_wait_nr, 1);
>>> +		smp_mb();
>>> +		if (!llist_empty(&ctx->work_llist))
>>> +			goto out_wake;
>>> +	}
>>> +
>>> +	iowq->t.function = io_cqring_timer_wakeup;
>>> +	hrtimer_set_expires(timer, iowq->timeout);
>>
>> What happens if timeout < min_timeout? Would the timer expired callback
>> io_cqring_timer_wakeup() be called right away?
> 
> See the test cases, test/min-timeout-wait.c has various cases like that
> to ensure that they work. But the first check in this function is for
> timeout not being set, or being smaller to the min_timeout.
> 
>>> +	return HRTIMER_RESTART;
>>> +out_wake:
>>> +	return io_cqring_timer_wakeup(timer);
>>> +}
>>> +
>>>  static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
>>> -				      clockid_t clock_id)
>>> +				      clockid_t clock_id, ktime_t start_time)
>>>  {
>>> -	iowq->hit_timeout = 0;
>>> +	ktime_t timeout;
>>> +
>>> +	WRITE_ONCE(iowq->hit_timeout, 0);
>>>  	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
>>> -	iowq->t.function = io_cqring_timer_wakeup;
>>> -	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
>>> +	if (iowq->min_timeout) {
>>> +		timeout = ktime_add_ns(iowq->min_timeout, start_time);
>>> +		iowq->t.function = io_cqring_min_timer_wakeup;
>>> +	} else {
>>> +		timeout = iowq->timeout;
>>> +		iowq->t.function = io_cqring_timer_wakeup;
>>> +	}
>>> +
>>> +	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
>>>  	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
>>>  
>>>  	if (!READ_ONCE(iowq->hit_timeout))
>>> @@ -2379,7 +2433,8 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
>>>  }
>>>  
>>>  static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>> -				     struct io_wait_queue *iowq)
>>> +				     struct io_wait_queue *iowq,
>>> +				     ktime_t start_time)
>>>  {
>>>  	int ret = 0;
>>>  
>>> @@ -2390,8 +2445,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>>  	 */
>>>  	if (current_pending_io())
>>>  		current->in_iowait = 1;
>>> -	if (iowq->timeout != KTIME_MAX)
>>> -		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
>>> +	if (iowq->timeout != KTIME_MAX || iowq->min_timeout != KTIME_MAX)
>>> +		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
>>
>> In this case it is possible for either timeout or min_timeout to be
>> KTIME_MAX and still schedule a timeout.
>>
>> If min_timeout != KTIME_MAX and timeout == KTIME_MAX, then
>> io_cqring_min_timer_wakeup() will reset itself to a timer with
>> KTIME_MAX.
>>
>> If min_timeout == KTIME_MAX and timeout != KTIME_MAX, then a KTIME_MAX
>> timer will be set.
>>
>> This should be fine, the timer will never expire and schedule() is
>> called regardless. The previous code is a small optimisation to avoid
>> setting up a timer that will never expire.
> 
> We should not be setting up a timer if both min-timeout and regular
> timeout are not given. Am I missing something? If either is set, we need
> a timer to wake us up. If neither is set, we should not be setting up a
> timer, we just need to call schedule().

Yeah, mostly talking to myself. If min_timeout == KTIME_MAX and timeout
is valid then we end up setting a timer that would never expire. I think
this is one case where scheduling a timer can be skipped. But I don't
think it will matter.

> 
>>> @@ -2400,7 +2455,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>>  
>>>  /* If this returns > 0, the caller should retry */
>>>  static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>> -					  struct io_wait_queue *iowq)
>>> +					  struct io_wait_queue *iowq,
>>> +					  ktime_t start_time)
>>>  {
>>>  	if (unlikely(READ_ONCE(ctx->check_cq)))
>>>  		return 1;
>>> @@ -2413,7 +2469,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>>  	if (unlikely(io_should_wake(iowq)))
>>>  		return 0;
>>>  
>>> -	return __io_cqring_wait_schedule(ctx, iowq);
>>> +	return __io_cqring_wait_schedule(ctx, iowq, start_time);
>>>  }
>>>  
>>>  struct ext_arg {
>>> @@ -2431,6 +2487,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>>  {
>>>  	struct io_wait_queue iowq;
>>>  	struct io_rings *rings = ctx->rings;
>>> +	ktime_t start_time;
>>>  	int ret;
>>>  
>>>  	if (!io_allowed_run_tw(ctx))
>>> @@ -2449,8 +2506,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>>  	INIT_LIST_HEAD(&iowq.wq.entry);
>>>  	iowq.ctx = ctx;
>>>  	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
>>> +	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
>>>  	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
>>> +	iowq.min_timeout = 0;
>>>  	iowq.timeout = KTIME_MAX;
>>> +	start_time = io_get_time(ctx);
>>>  
>>>  	if (ext_arg->ts) {
>>>  		struct timespec64 ts;
>>> @@ -2460,7 +2520,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>>  
>>>  		iowq.timeout = timespec64_to_ktime(ts);
>>>  		if (!(flags & IORING_ENTER_ABS_TIMER))
>>> -			iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
>>> +			iowq.timeout = ktime_add(iowq.timeout, start_time);
>>>  	}
>>>  
>>>  	if (ext_arg->sig) {
>>> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>>  		unsigned long check_cq;
>>>  
>>>  		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>>> -			atomic_set(&ctx->cq_wait_nr, nr_wait);
>>> +			/* if min timeout has been hit, don't reset wait count */
>>> +			if (!READ_ONCE(iowq.hit_timeout))
>>> +				atomic_set(&ctx->cq_wait_nr, nr_wait);
>>
>> Only the two timeout expired callback functions
>> io_cqring_min_timer_wakeup() and io_cqring_timer_wakeup() sets
>> hit_timeout to 1. In this case, io_cqring_schedule_timeout() would
>> return -ETIME and the do {...} while(1) loop in io_cqring_wait() would
>> break. So I'm not sure if it is possible to reach here with hit_timeout
>> = 1.
>>
>> Also, in the first iteration of the loop, hit_timeout is init to 0
>> inside of io_cqring_wait_schedule() -> __io_cqring_wait_schedule() ->
>> io_cqring_schedule_timeout(). So it is possible for hit_timeout to be
>> READ_ONCE before it is initialised. If this code is kept we should init
>> iowq.hit_timeout = 0 above.
> 
> Yeah we probably should initialize it. The issue here isn't really if a
> timer woke us up, it's if the task got woken by something else and loop
> around for another retry. If that coincides with the timeout hitting,
> then we should not re-set ->cq_wait_nr as it should've been already set
> to 1 so any request being added will wake us up.

Ahh right, the so called 'spurious wakeups'. The timer may run after the
task is woken up by something else, and before the timer is cancelled.
In this case the task should definitely not touch cq_wait_nr if the
timer callback already set it to 1!

> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-21 14:16 ` [PATCH 3/5] io_uring: implement our own schedule timeout handling Jens Axboe
@ 2024-08-22 13:22   ` Pavel Begunkov
  2024-08-22 15:27     ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Pavel Begunkov @ 2024-08-22 13:22 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: dw

On 8/21/24 15:16, Jens Axboe wrote:
> In preparation for having two distinct timeouts and avoid waking the
> task if we don't need to.
> 
> Signed-off-by: Jens Axboe <[email protected]>
> ---
>   io_uring/io_uring.c | 37 ++++++++++++++++++++++++++++++++-----
>   io_uring/io_uring.h |  2 ++
>   2 files changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 9e2b8d4c05db..4ba5292137c3 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>   	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>   	 * the task, and the next invocation will do it.
>   	 */
> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)

Shouldn't be needed. If the timer fires, it should wake the task,
and the task will check ->hit_timeout there and later remove the
itself from the waitqueue.

>   		return autoremove_wake_function(curr, mode, wake_flags, key);
>   	return -1;
>   }
> @@ -2350,6 +2350,34 @@ static bool current_pending_io(void)
>   	return percpu_counter_read_positive(&tctx->inflight);
>   }
...

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-21 14:16 ` [PATCH 4/5] io_uring: add support for batch wait timeout Jens Axboe
  2024-08-21 18:25   ` David Wei
@ 2024-08-22 13:46   ` Pavel Begunkov
  2024-08-22 15:37     ` Jens Axboe
  1 sibling, 1 reply; 29+ messages in thread
From: Pavel Begunkov @ 2024-08-22 13:46 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: dw

On 8/21/24 15:16, Jens Axboe wrote:
> Waiting for events with io_uring has two knobs that can be set:
> 
> 1) The number of events to wake for
> 2) The timeout associated with the event
> 
> Waiting will abort when either of those conditions are met, as expected.
> 
> This adds support for a third event, which is associated with the number
> of events to wait for. Applications generally like to handle batches of
> completions, and right now they'd set a number of events to wait for and
> the timeout for that. If no events have been received but the timeout
> triggers, control is returned to the application and it can wait again.
> However, if the application doesn't have anything to do until events are
> reaped, then it's possible to make this waiting more efficient.
> 
> For example, the application may have a latency time of 50 usecs and
> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
> as the timeout, then it'll be doing 20K context switches per second even
> if nothing is happening.
> 
> This introduces the notion of min batch wait time. If the min batch wait
> time expires, then we'll return to userspace if we have any events at all.
> If none are available, the general wait time is applied. Any request
> arriving after the min batch wait time will cause waiting to stop and
> return control to the application.
> 
> Signed-off-by: Jens Axboe <[email protected]>
> ---
>   io_uring/io_uring.c | 88 ++++++++++++++++++++++++++++++++++++++-------
>   io_uring/io_uring.h |  2 ++
>   2 files changed, 77 insertions(+), 13 deletions(-)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 4ba5292137c3..87e7cf6551d7 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2322,7 +2322,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>   	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
>   	 * the task, and the next invocation will do it.
>   	 */
> -	if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
> +	if (io_should_wake(iowq) || io_has_work(iowq->ctx) ||
> +	    READ_ONCE(iowq->hit_timeout))
>   		return autoremove_wake_function(curr, mode, wake_flags, key);
>   	return -1;
>   }
> @@ -2359,13 +2360,66 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>   	return HRTIMER_NORESTART;
>   }
>   
> +/*
> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
> + * wake up. If not, and we have a normal timeout, switch to that and keep
> + * sleeping.
> + */
> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
> +{
> +	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	/* no general timeout, or shorter, we are done */
> +	if (iowq->timeout == KTIME_MAX ||
> +	    ktime_after(iowq->min_timeout, iowq->timeout))
> +		goto out_wake;
> +	/* work we may need to run, wake function will see if we need to wake */
> +	if (io_has_work(ctx))
> +		goto out_wake;
> +	/* got events since we started waiting, min timeout is done */
> +	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
> +		goto out_wake;
> +	/* if we have any events and min timeout expired, we're done */
> +	if (io_cqring_events(ctx))
> +		goto out_wake;
> +
> +	/*
> +	 * If using deferred task_work running and application is waiting on
> +	 * more than one request, ensure we reset it now where we are switching
> +	 * to normal sleeps. Any request completion post min_wait should wake
> +	 * the task and return.
> +	 */
> +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
> +		atomic_set(&ctx->cq_wait_nr, 1);
> +		smp_mb();
> +		if (!llist_empty(&ctx->work_llist))
> +			goto out_wake;
> +	}
> +
> +	iowq->t.function = io_cqring_timer_wakeup;
> +	hrtimer_set_expires(timer, iowq->timeout);
> +	return HRTIMER_RESTART;
> +out_wake:
> +	return io_cqring_timer_wakeup(timer);
> +}
> +
>   static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
> -				      clockid_t clock_id)
> +				      clockid_t clock_id, ktime_t start_time)
>   {
> -	iowq->hit_timeout = 0;
> +	ktime_t timeout;
> +
> +	WRITE_ONCE(iowq->hit_timeout, 0);
>   	hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
> -	iowq->t.function = io_cqring_timer_wakeup;
> -	hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
> +	if (iowq->min_timeout) {

What's the default, 0 or KTIME_MAX? __io_cqring_wait_schedule()
checks KTIME_MAX instead.

It likely needs to account for hit_timeout. Not looking deep into
the new callback, but imagine that it expired and you promoted the
timeout to the next stage (long wait). Then you get a spurious wake
up, it cancels timeouts, loops in io_cqring_wait() and gets back to
schedule timeout. Since nothing modified ->min_timeout it'll
try a short timeout again.


> +		timeout = ktime_add_ns(iowq->min_timeout, start_time);
> +		iowq->t.function = io_cqring_min_timer_wakeup;
> +	} else {
> +		timeout = iowq->timeout;
> +		iowq->t.function = io_cqring_timer_wakeup;
> +	}
> +
> +	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
>   	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
>   
>   	if (!READ_ONCE(iowq->hit_timeout))
> @@ -2379,7 +2433,8 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
>   }
>   
>   static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
> -				     struct io_wait_queue *iowq)
> +				     struct io_wait_queue *iowq,
> +				     ktime_t start_time)
>   {
>   	int ret = 0;
>   
> @@ -2390,8 +2445,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>   	 */
>   	if (current_pending_io())
>   		current->in_iowait = 1;
> -	if (iowq->timeout != KTIME_MAX)
> -		ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
> +	if (iowq->timeout != KTIME_MAX || iowq->min_timeout != KTIME_MAX)
> +		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
>   	else
>   		schedule();
>   	current->in_iowait = 0;
> @@ -2400,7 +2455,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>   
>   /* If this returns > 0, the caller should retry */
>   static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
> -					  struct io_wait_queue *iowq)
> +					  struct io_wait_queue *iowq,
> +					  ktime_t start_time)
>   {
>   	if (unlikely(READ_ONCE(ctx->check_cq)))
>   		return 1;
> @@ -2413,7 +2469,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>   	if (unlikely(io_should_wake(iowq)))
>   		return 0;
>   
> -	return __io_cqring_wait_schedule(ctx, iowq);
> +	return __io_cqring_wait_schedule(ctx, iowq, start_time);
>   }
>   
>   struct ext_arg {
> @@ -2431,6 +2487,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>   {
>   	struct io_wait_queue iowq;
>   	struct io_rings *rings = ctx->rings;
> +	ktime_t start_time;
>   	int ret;
>   
>   	if (!io_allowed_run_tw(ctx))
> @@ -2449,8 +2506,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>   	INIT_LIST_HEAD(&iowq.wq.entry);
>   	iowq.ctx = ctx;
>   	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
> +	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
>   	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
> +	iowq.min_timeout = 0;
>   	iowq.timeout = KTIME_MAX;
> +	start_time = io_get_time(ctx);
>   
>   	if (ext_arg->ts) {
>   		struct timespec64 ts;
> @@ -2460,7 +2520,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>   
>   		iowq.timeout = timespec64_to_ktime(ts);
>   		if (!(flags & IORING_ENTER_ABS_TIMER))
> -			iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
> +			iowq.timeout = ktime_add(iowq.timeout, start_time);
>   	}
>   
>   	if (ext_arg->sig) {
> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>   		unsigned long check_cq;
>   
>   		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
> -			atomic_set(&ctx->cq_wait_nr, nr_wait);
> +			/* if min timeout has been hit, don't reset wait count */
> +			if (!READ_ONCE(iowq.hit_timeout))

Why read once? You're out of io_cqring_schedule_timeout(),
timers are cancelled and everything should've been synchronised
by this point.

FWIW, it was also fine setting it to 1 in the timer callback
for the same reason. However...


> +				atomic_set(&ctx->cq_wait_nr, nr_wait);

if (hit_timeout)
	nr_wait = 1;
atomic_set(cq_wait_nr, nr_wait);

otherwise, you're risking not to be ever woken up
ever again for this wait by tw.


>   			set_current_state(TASK_INTERRUPTIBLE);
>   		} else {
>   			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
>   							TASK_INTERRUPTIBLE);
>   		}
>   
> -		ret = io_cqring_wait_schedule(ctx, &iowq);
> +		ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
>   		__set_current_state(TASK_RUNNING);
>   		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
>   
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index f95c1b080f4b..65078e641390 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -39,8 +39,10 @@ struct io_wait_queue {
>   	struct wait_queue_entry wq;
>   	struct io_ring_ctx *ctx;
>   	unsigned cq_tail;
> +	unsigned cq_min_tail;
>   	unsigned nr_timeouts;
>   	int hit_timeout;
> +	ktime_t min_timeout;
>   	ktime_t timeout;
>   	struct hrtimer t;
>   

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/5] io_uring: implement our own schedule timeout handling
  2024-08-22 13:22   ` Pavel Begunkov
@ 2024-08-22 15:27     ` Jens Axboe
  0 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2024-08-22 15:27 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: dw

On 8/22/24 7:22 AM, Pavel Begunkov wrote:
> On 8/21/24 15:16, Jens Axboe wrote:
>> In preparation for having two distinct timeouts and avoid waking the
>> task if we don't need to.
>>
>> Signed-off-by: Jens Axboe <[email protected]>
>> ---
>>   io_uring/io_uring.c | 37 ++++++++++++++++++++++++++++++++-----
>>   io_uring/io_uring.h |  2 ++
>>   2 files changed, 34 insertions(+), 5 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 9e2b8d4c05db..4ba5292137c3 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2322,7 +2322,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>        * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>        * the task, and the next invocation will do it.
>>        */
>> -    if (io_should_wake(iowq) || io_has_work(iowq->ctx))
>> +    if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
> 
> Shouldn't be needed. If the timer fires, it should wake the task,
> and the task will check ->hit_timeout there and later remove the
> itself from the waitqueue.

Good point indeed, I'll kill it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-22 13:46   ` Pavel Begunkov
@ 2024-08-22 15:37     ` Jens Axboe
  2024-08-22 16:06       ` Pavel Begunkov
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-22 15:37 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: dw

On 8/22/24 7:46 AM, Pavel Begunkov wrote:
> On 8/21/24 15:16, Jens Axboe wrote:
>> Waiting for events with io_uring has two knobs that can be set:
>>
>> 1) The number of events to wake for
>> 2) The timeout associated with the event
>>
>> Waiting will abort when either of those conditions are met, as expected.
>>
>> This adds support for a third event, which is associated with the number
>> of events to wait for. Applications generally like to handle batches of
>> completions, and right now they'd set a number of events to wait for and
>> the timeout for that. If no events have been received but the timeout
>> triggers, control is returned to the application and it can wait again.
>> However, if the application doesn't have anything to do until events are
>> reaped, then it's possible to make this waiting more efficient.
>>
>> For example, the application may have a latency time of 50 usecs and
>> wanting to handle a batch of 8 requests at the time. If it uses 50 usecs
>> as the timeout, then it'll be doing 20K context switches per second even
>> if nothing is happening.
>>
>> This introduces the notion of min batch wait time. If the min batch wait
>> time expires, then we'll return to userspace if we have any events at all.
>> If none are available, the general wait time is applied. Any request
>> arriving after the min batch wait time will cause waiting to stop and
>> return control to the application.
>>
>> Signed-off-by: Jens Axboe <[email protected]>
>> ---
>>   io_uring/io_uring.c | 88 ++++++++++++++++++++++++++++++++++++++-------
>>   io_uring/io_uring.h |  2 ++
>>   2 files changed, 77 insertions(+), 13 deletions(-)
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 4ba5292137c3..87e7cf6551d7 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2322,7 +2322,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>        * Cannot safely flush overflowed CQEs from here, ensure we wake up
>>        * the task, and the next invocation will do it.
>>        */
>> -    if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout)
>> +    if (io_should_wake(iowq) || io_has_work(iowq->ctx) ||
>> +        READ_ONCE(iowq->hit_timeout))
>>           return autoremove_wake_function(curr, mode, wake_flags, key);
>>       return -1;
>>   }
>> @@ -2359,13 +2360,66 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
>>       return HRTIMER_NORESTART;
>>   }
>>   +/*
>> + * Doing min_timeout portion. If we saw any timeouts, events, or have work,
>> + * wake up. If not, and we have a normal timeout, switch to that and keep
>> + * sleeping.
>> + */
>> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
>> +{
>> +    struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
>> +    struct io_ring_ctx *ctx = iowq->ctx;
>> +
>> +    /* no general timeout, or shorter, we are done */
>> +    if (iowq->timeout == KTIME_MAX ||
>> +        ktime_after(iowq->min_timeout, iowq->timeout))
>> +        goto out_wake;
>> +    /* work we may need to run, wake function will see if we need to wake */
>> +    if (io_has_work(ctx))
>> +        goto out_wake;
>> +    /* got events since we started waiting, min timeout is done */
>> +    if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
>> +        goto out_wake;
>> +    /* if we have any events and min timeout expired, we're done */
>> +    if (io_cqring_events(ctx))
>> +        goto out_wake;
>> +
>> +    /*
>> +     * If using deferred task_work running and application is waiting on
>> +     * more than one request, ensure we reset it now where we are switching
>> +     * to normal sleeps. Any request completion post min_wait should wake
>> +     * the task and return.
>> +     */
>> +    if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>> +        atomic_set(&ctx->cq_wait_nr, 1);
>> +        smp_mb();
>> +        if (!llist_empty(&ctx->work_llist))
>> +            goto out_wake;
>> +    }
>> +
>> +    iowq->t.function = io_cqring_timer_wakeup;
>> +    hrtimer_set_expires(timer, iowq->timeout);
>> +    return HRTIMER_RESTART;
>> +out_wake:
>> +    return io_cqring_timer_wakeup(timer);
>> +}
>> +
>>   static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
>> -                      clockid_t clock_id)
>> +                      clockid_t clock_id, ktime_t start_time)
>>   {
>> -    iowq->hit_timeout = 0;
>> +    ktime_t timeout;
>> +
>> +    WRITE_ONCE(iowq->hit_timeout, 0);
>>       hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS);
>> -    iowq->t.function = io_cqring_timer_wakeup;
>> -    hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0);
>> +    if (iowq->min_timeout) {
> 
> What's the default, 0 or KTIME_MAX? __io_cqring_wait_schedule()
> checks KTIME_MAX instead.

In practice either one works, but let's keep it consistent - since it's
a relative value (eg you ask for xx usec), I'll change the one that
checks for KTIME_MAX to just check if it's set.

> It likely needs to account for hit_timeout. Not looking deep into
> the new callback, but imagine that it expired and you promoted the
> timeout to the next stage (long wait). Then you get a spurious wake
> up, it cancels timeouts, loops in io_cqring_wait() and gets back to
> schedule timeout. Since nothing modified ->min_timeout it'll
> try a short timeout again.

Yeah good point, we don't want to redo it for that case. With
hit_timeout being set earlier now, we can just check it in here.

>> @@ -2379,7 +2433,8 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
>>   }
>>     static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>> -                     struct io_wait_queue *iowq)
>> +                     struct io_wait_queue *iowq,
>> +                     ktime_t start_time)
>>   {
>>       int ret = 0;
>>   @@ -2390,8 +2445,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>        */
>>       if (current_pending_io())
>>           current->in_iowait = 1;
>> -    if (iowq->timeout != KTIME_MAX)
>> -        ret = io_cqring_schedule_timeout(iowq, ctx->clockid);
>> +    if (iowq->timeout != KTIME_MAX || iowq->min_timeout != KTIME_MAX)
>> +        ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
>>       else
>>           schedule();
>>       current->in_iowait = 0;
>> @@ -2400,7 +2455,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>     /* If this returns > 0, the caller should retry */
>>   static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>> -                      struct io_wait_queue *iowq)
>> +                      struct io_wait_queue *iowq,
>> +                      ktime_t start_time)
>>   {
>>       if (unlikely(READ_ONCE(ctx->check_cq)))
>>           return 1;
>> @@ -2413,7 +2469,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>>       if (unlikely(io_should_wake(iowq)))
>>           return 0;
>>   -    return __io_cqring_wait_schedule(ctx, iowq);
>> +    return __io_cqring_wait_schedule(ctx, iowq, start_time);
>>   }
>>     struct ext_arg {
>> @@ -2431,6 +2487,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>   {
>>       struct io_wait_queue iowq;
>>       struct io_rings *rings = ctx->rings;
>> +    ktime_t start_time;
>>       int ret;
>>         if (!io_allowed_run_tw(ctx))
>> @@ -2449,8 +2506,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>       INIT_LIST_HEAD(&iowq.wq.entry);
>>       iowq.ctx = ctx;
>>       iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
>> +    iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
>>       iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
>> +    iowq.min_timeout = 0;
>>       iowq.timeout = KTIME_MAX;
>> +    start_time = io_get_time(ctx);
>>         if (ext_arg->ts) {
>>           struct timespec64 ts;
>> @@ -2460,7 +2520,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>             iowq.timeout = timespec64_to_ktime(ts);
>>           if (!(flags & IORING_ENTER_ABS_TIMER))
>> -            iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx));
>> +            iowq.timeout = ktime_add(iowq.timeout, start_time);
>>       }
>>         if (ext_arg->sig) {
>> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>           unsigned long check_cq;
>>             if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>> -            atomic_set(&ctx->cq_wait_nr, nr_wait);
>> +            /* if min timeout has been hit, don't reset wait count */
>> +            if (!READ_ONCE(iowq.hit_timeout))
> 
> Why read once? You're out of io_cqring_schedule_timeout(),
> timers are cancelled and everything should've been synchronised
> by this point.

Just for consistency's sake.

>> +                atomic_set(&ctx->cq_wait_nr, nr_wait);
> 
> if (hit_timeout)
>     nr_wait = 1;
> atomic_set(cq_wait_nr, nr_wait);
> 
> otherwise, you're risking not to be ever woken up
> ever again for this wait by tw.

Good point, I'll init nr_wait rather than check here.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-22 15:37     ` Jens Axboe
@ 2024-08-22 16:06       ` Pavel Begunkov
  2024-08-22 16:14         ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Pavel Begunkov @ 2024-08-22 16:06 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: dw

On 8/22/24 16:37, Jens Axboe wrote:
> On 8/22/24 7:46 AM, Pavel Begunkov wrote:
>> On 8/21/24 15:16, Jens Axboe wrote:
...
>>>          if (ext_arg->sig) {
>>> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>>            unsigned long check_cq;
>>>              if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>>> -            atomic_set(&ctx->cq_wait_nr, nr_wait);
>>> +            /* if min timeout has been hit, don't reset wait count */
>>> +            if (!READ_ONCE(iowq.hit_timeout))
>>
>> Why read once? You're out of io_cqring_schedule_timeout(),
>> timers are cancelled and everything should've been synchronised
>> by this point.
> 
> Just for consistency's sake.

Please drop it. Sync primitives tell a story, and this one says
that it's racing with something when it's not. It's always hard to
work with code with unnecessary protection. If it has to change in
the future the first question asked would be why read once is there,
what does it try to achieve / protect and if it's safe to kill it.
It'll also hide real races from sanitizers.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-22 16:06       ` Pavel Begunkov
@ 2024-08-22 16:14         ` Jens Axboe
  2024-08-22 16:24           ` Pavel Begunkov
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2024-08-22 16:14 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: dw

On 8/22/24 10:06 AM, Pavel Begunkov wrote:
> On 8/22/24 16:37, Jens Axboe wrote:
>> On 8/22/24 7:46 AM, Pavel Begunkov wrote:
>>> On 8/21/24 15:16, Jens Axboe wrote:
> ...
>>>>          if (ext_arg->sig) {
>>>> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>>>            unsigned long check_cq;
>>>>              if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>>>> -            atomic_set(&ctx->cq_wait_nr, nr_wait);
>>>> +            /* if min timeout has been hit, don't reset wait count */
>>>> +            if (!READ_ONCE(iowq.hit_timeout))
>>>
>>> Why read once? You're out of io_cqring_schedule_timeout(),
>>> timers are cancelled and everything should've been synchronised
>>> by this point.
>>
>> Just for consistency's sake.
> 
> Please drop it. Sync primitives tell a story, and this one says
> that it's racing with something when it's not. It's always hard to
> work with code with unnecessary protection. If it has to change in
> the future the first question asked would be why read once is there,
> what does it try to achieve / protect and if it's safe to kill it.
> It'll also hide real races from sanitizers.

Sure I don't disagree, I'll kill it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 4/5] io_uring: add support for batch wait timeout
  2024-08-22 16:14         ` Jens Axboe
@ 2024-08-22 16:24           ` Pavel Begunkov
  0 siblings, 0 replies; 29+ messages in thread
From: Pavel Begunkov @ 2024-08-22 16:24 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: dw

On 8/22/24 17:14, Jens Axboe wrote:
> On 8/22/24 10:06 AM, Pavel Begunkov wrote:
>> On 8/22/24 16:37, Jens Axboe wrote:
>>> On 8/22/24 7:46 AM, Pavel Begunkov wrote:
>>>> On 8/21/24 15:16, Jens Axboe wrote:
>> ...
>>>>>           if (ext_arg->sig) {
>>>>> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>>>>>             unsigned long check_cq;
>>>>>               if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
>>>>> -            atomic_set(&ctx->cq_wait_nr, nr_wait);
>>>>> +            /* if min timeout has been hit, don't reset wait count */
>>>>> +            if (!READ_ONCE(iowq.hit_timeout))
>>>>
>>>> Why read once? You're out of io_cqring_schedule_timeout(),
>>>> timers are cancelled and everything should've been synchronised
>>>> by this point.
>>>
>>> Just for consistency's sake.
>>
>> Please drop it. Sync primitives tell a story, and this one says
>> that it's racing with something when it's not. It's always hard to
>> work with code with unnecessary protection. If it has to change in
>> the future the first question asked would be why read once is there,
>> what does it try to achieve / protect and if it's safe to kill it.
>> It'll also hide real races from sanitizers.
> 
> Sure I don't disagree, I'll kill it.

Thanks. Personal trauma, especially after tracking down some chunks
of code back to 2.6 with no explanation nor author to ask.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2024-08-22 16:24 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-21 14:16 [PATCHSET v5 0/5] Add support for batched min timeout Jens Axboe
2024-08-21 14:16 ` [PATCH 1/5] io_uring: encapsulate extraneous wait flags into a separate struct Jens Axboe
2024-08-21 14:16 ` [PATCH 2/5] io_uring: move schedule wait logic into helper Jens Axboe
2024-08-21 14:16 ` [PATCH 3/5] io_uring: implement our own schedule timeout handling Jens Axboe
2024-08-22 13:22   ` Pavel Begunkov
2024-08-22 15:27     ` Jens Axboe
2024-08-21 14:16 ` [PATCH 4/5] io_uring: add support for batch wait timeout Jens Axboe
2024-08-21 18:25   ` David Wei
2024-08-21 18:38     ` Jens Axboe
2024-08-21 18:54       ` David Wei
2024-08-22 13:46   ` Pavel Begunkov
2024-08-22 15:37     ` Jens Axboe
2024-08-22 16:06       ` Pavel Begunkov
2024-08-22 16:14         ` Jens Axboe
2024-08-22 16:24           ` Pavel Begunkov
2024-08-21 14:16 ` [PATCH 5/5] io_uring: wire up min batch wake timeout Jens Axboe
  -- strict thread matches above, loose matches on Subject: below --
2024-08-19 23:28 [PATCHSET v4 0/5] Add support for batched min timeout Jens Axboe
2024-08-19 23:28 ` [PATCH 3/5] io_uring: implement our own schedule timeout handling Jens Axboe
2024-08-20 20:08   ` David Wei
2024-08-20 21:34     ` Jens Axboe
2024-08-20 21:37       ` David Wei
2024-08-20 21:39         ` Jens Axboe
2024-08-20 22:04           ` Pavel Begunkov
2024-08-20 22:06           ` David Wei
2024-08-20 22:13             ` Pavel Begunkov
2024-08-20 22:14               ` David Wei
2024-08-20 22:19                 ` Jens Axboe
2024-08-20 22:51                   ` Jens Axboe
2024-08-20 22:54                     ` Jens Axboe
2024-08-16 20:38 [PATCHSET v3 0/5] Add support for batched min timeout Jens Axboe
2024-08-16 20:38 ` [PATCH 3/5] io_uring: implement our own schedule timeout handling Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox