* [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
@ 2022-04-03 11:45 Ming Lei
2022-04-04 16:51 ` Mike Snitzer
2022-04-06 2:20 ` Jens Axboe
0 siblings, 2 replies; 9+ messages in thread
From: Ming Lei @ 2022-04-03 11:45 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: Ming Lei, Mike Snitzer
-EAGAIN may still be returned after the io issue path returns, and REQ_F_REISSUE is
set in io_complete_rw_iopoll(), but the req never gets a chance to be handled.
io_iopoll_check doesn't handle this situation, and an io hang can result.
Current dm io polling may return -EAGAIN after bio submission has
returned, and blk-throttle might trigger this situation too.
Cc: Mike Snitzer <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
fs/io-wq.h | 13 +++++
fs/io_uring.c | 128 ++++++++++++++++++++++++++++----------------------
2 files changed, 86 insertions(+), 55 deletions(-)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index dbecd27656c7..4ca4863664fb 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -96,6 +96,19 @@ static inline void wq_list_add_head(struct io_wq_work_node *node,
WRITE_ONCE(list->first, node);
}
+static inline void wq_list_remove(struct io_wq_work_list *list,
+ struct io_wq_work_node *prev,
+ struct io_wq_work_node *node)
+{
+ if (!prev)
+ WRITE_ONCE(list->first, node->next);
+ else
+ prev->next = node->next;
+
+ if (node == list->last)
+ list->last = prev;
+}
+
static inline void wq_list_cut(struct io_wq_work_list *list,
struct io_wq_work_node *last,
struct io_wq_work_node *prev)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 59e54a6854b7..6db5514e10ca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2759,6 +2759,65 @@ static inline bool io_run_task_work(void)
return false;
}
+#ifdef CONFIG_BLOCK
+static bool io_resubmit_prep(struct io_kiocb *req)
+{
+ struct io_async_rw *rw = req->async_data;
+
+ if (!req_has_async_data(req))
+ return !io_req_prep_async(req);
+ iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
+ return true;
+}
+
+static bool io_rw_should_reissue(struct io_kiocb *req)
+{
+ umode_t mode = file_inode(req->file)->i_mode;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!S_ISBLK(mode) && !S_ISREG(mode))
+ return false;
+ if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
+ !(ctx->flags & IORING_SETUP_IOPOLL)))
+ return false;
+ /*
+ * If ref is dying, we might be running poll reap from the exit work.
+ * Don't attempt to reissue from that path, just let it fail with
+ * -EAGAIN.
+ */
+ if (percpu_ref_is_dying(&ctx->refs))
+ return false;
+ /*
+ * Play it safe and assume not safe to re-import and reissue if we're
+ * not in the original thread group (or in task context).
+ */
+ if (!same_thread_group(req->task, current) || !in_task())
+ return false;
+ return true;
+}
+#else
+static bool io_resubmit_prep(struct io_kiocb *req)
+{
+ return false;
+}
+static bool io_rw_should_reissue(struct io_kiocb *req)
+{
+ return false;
+}
+#endif
+
+static void do_io_reissue(struct io_kiocb *req, int ret)
+{
+ if (req->flags & REQ_F_REISSUE) {
+ req->flags &= ~REQ_F_REISSUE;
+ if (io_resubmit_prep(req))
+ io_req_task_queue_reissue(req);
+ else
+ io_req_task_queue_fail(req, ret);
+ }
+}
+
+
static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
struct io_wq_work_node *pos, *start, *prev;
@@ -2786,6 +2845,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (READ_ONCE(req->iopoll_completed))
break;
+ /*
+ * Once REISSUE flag is set, the req has been done, and we
+ * have to retry
+ */
+ if (req->flags & REQ_F_REISSUE)
+ break;
+
ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
if (unlikely(ret < 0))
return ret;
@@ -2807,6 +2873,12 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
wq_list_for_each_resume(pos, prev) {
struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
+ if (req->flags & REQ_F_REISSUE) {
+ wq_list_remove(&ctx->iopoll_list, prev, pos);
+ do_io_reissue(req, -EIO);
+ break;
+ }
+
/* order with io_complete_rw_iopoll(), e.g. ->result updates */
if (!smp_load_acquire(&req->iopoll_completed))
break;
@@ -2924,53 +2996,6 @@ static void kiocb_end_write(struct io_kiocb *req)
}
}
-#ifdef CONFIG_BLOCK
-static bool io_resubmit_prep(struct io_kiocb *req)
-{
- struct io_async_rw *rw = req->async_data;
-
- if (!req_has_async_data(req))
- return !io_req_prep_async(req);
- iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
- return true;
-}
-
-static bool io_rw_should_reissue(struct io_kiocb *req)
-{
- umode_t mode = file_inode(req->file)->i_mode;
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!S_ISBLK(mode) && !S_ISREG(mode))
- return false;
- if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
- !(ctx->flags & IORING_SETUP_IOPOLL)))
- return false;
- /*
- * If ref is dying, we might be running poll reap from the exit work.
- * Don't attempt to reissue from that path, just let it fail with
- * -EAGAIN.
- */
- if (percpu_ref_is_dying(&ctx->refs))
- return false;
- /*
- * Play it safe and assume not safe to re-import and reissue if we're
- * not in the original thread group (or in task context).
- */
- if (!same_thread_group(req->task, current) || !in_task())
- return false;
- return true;
-}
-#else
-static bool io_resubmit_prep(struct io_kiocb *req)
-{
- return false;
-}
-static bool io_rw_should_reissue(struct io_kiocb *req)
-{
- return false;
-}
-#endif
-
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (req->rw.kiocb.ki_flags & IOCB_WRITE)
@@ -3264,14 +3289,7 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret,
__io_complete_rw(req, ret, issue_flags);
else
io_rw_done(&req->rw.kiocb, ret);
-
- if (req->flags & REQ_F_REISSUE) {
- req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req))
- io_req_task_queue_reissue(req);
- else
- io_req_task_queue_fail(req, ret);
- }
+ do_io_reissue(req, ret);
}
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
--
2.31.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
2022-04-03 11:45 [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns Ming Lei
@ 2022-04-04 16:51 ` Mike Snitzer
2022-04-06 2:09 ` Ming Lei
2022-04-06 2:20 ` Jens Axboe
1 sibling, 1 reply; 9+ messages in thread
From: Mike Snitzer @ 2022-04-04 16:51 UTC (permalink / raw)
To: Ming Lei; +Cc: Jens Axboe, io-uring, dm-devel
On Sun, Apr 03 2022 at 7:45P -0400,
Ming Lei <[email protected]> wrote:
> -EAGAIN still may return after io issue returns, and REQ_F_REISSUE is
> set in io_complete_rw_iopoll(), but the req never gets chance to be handled.
> io_iopoll_check doesn't handle this situation, and io hang can be caused.
>
> Current dm io polling may return -EAGAIN after bio submission is
> returned, also blk-throttle might trigger this situation too.
>
> Cc: Mike Snitzer <[email protected]>
> Signed-off-by: Ming Lei <[email protected]>
I first reverted commit 5291984004ed ("dm: fix bio polling to handle
possibile BLK_STS_AGAIN") then applied this patch and verified this
fixes the DM bio polling hangs. Nice work!
But interestingly with this fio test (against dm-linear ontop of
null_blk with queue_mode=2 submit_queues=8 poll_queues=2 bs=4096 gb=16):
fio --bs=4096 --ioengine=io_uring --fixedbufs --registerfiles --hipri=1 \
--iodepth=16 --iodepth_batch_submit=16 --iodepth_batch_complete_min=16 \
--filename=/dev/mapper/linear --direct=1 --runtime=20 --numjobs=16 \
--rw=randread --name=test --group_reporting --norandommap
I get 3186k IOPS with your patch to have io_uring retry (and commit
5291984004ed reverted), but 4305k IOPS if leave commit 5291984004ed
applied (and DM resorts to retrying any -EAGAIN _without_ polling).
Jens rightly pointed out to me that polling tests that exhaust tags
are bogus anyway (because such unbounded IO defeats the point of
polling). Jens also thinks my result, with commit 5291984004ed
applied, is somehow bogus and not to be trusted ;) He is very likely
correct, and the failure is likely in the null_blk driver -- I'm
skeptical of that driver given it cannot pass fio verify testing
(e.g. --do_verify=1 --verify=crc32c --verify_async=1) with or without
polling.
Review comments inlined below.
> ---
> fs/io-wq.h | 13 +++++
> fs/io_uring.c | 128 ++++++++++++++++++++++++++++----------------------
> 2 files changed, 86 insertions(+), 55 deletions(-)
>
> diff --git a/fs/io-wq.h b/fs/io-wq.h
> index dbecd27656c7..4ca4863664fb 100644
> --- a/fs/io-wq.h
> +++ b/fs/io-wq.h
> @@ -96,6 +96,19 @@ static inline void wq_list_add_head(struct io_wq_work_node *node,
> WRITE_ONCE(list->first, node);
> }
>
> +static inline void wq_list_remove(struct io_wq_work_list *list,
> + struct io_wq_work_node *prev,
> + struct io_wq_work_node *node)
> +{
> + if (!prev)
> + WRITE_ONCE(list->first, node->next);
> + else
> + prev->next = node->next;
> +
> + if (node == list->last)
> + list->last = prev;
> +}
> +
> static inline void wq_list_cut(struct io_wq_work_list *list,
> struct io_wq_work_node *last,
> struct io_wq_work_node *prev)
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 59e54a6854b7..6db5514e10ca 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2759,6 +2759,65 @@ static inline bool io_run_task_work(void)
> return false;
> }
>
> +#ifdef CONFIG_BLOCK
> +static bool io_resubmit_prep(struct io_kiocb *req)
> +{
> + struct io_async_rw *rw = req->async_data;
> +
> + if (!req_has_async_data(req))
> + return !io_req_prep_async(req);
> + iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
> + return true;
> +}
> +
> +static bool io_rw_should_reissue(struct io_kiocb *req)
> +{
> + umode_t mode = file_inode(req->file)->i_mode;
> + struct io_ring_ctx *ctx = req->ctx;
> +
> + if (!S_ISBLK(mode) && !S_ISREG(mode))
> + return false;
> + if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
> + !(ctx->flags & IORING_SETUP_IOPOLL)))
> + return false;
> + /*
> + * If ref is dying, we might be running poll reap from the exit work.
> + * Don't attempt to reissue from that path, just let it fail with
> + * -EAGAIN.
> + */
> + if (percpu_ref_is_dying(&ctx->refs))
> + return false;
> + /*
> + * Play it safe and assume not safe to re-import and reissue if we're
> + * not in the original thread group (or in task context).
> + */
> + if (!same_thread_group(req->task, current) || !in_task())
> + return false;
> + return true;
> +}
> +#else
> +static bool io_resubmit_prep(struct io_kiocb *req)
> +{
> + return false;
> +}
> +static bool io_rw_should_reissue(struct io_kiocb *req)
> +{
> + return false;
> +}
> +#endif
> +
> +static void do_io_reissue(struct io_kiocb *req, int ret)
> +{
> + if (req->flags & REQ_F_REISSUE) {
> + req->flags &= ~REQ_F_REISSUE;
> + if (io_resubmit_prep(req))
> + io_req_task_queue_reissue(req);
> + else
> + io_req_task_queue_fail(req, ret);
> + }
> +}
Minor nit but: I'd leave caller to check for REQ_F_REISSUE.
> +
> +
> static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> {
> struct io_wq_work_node *pos, *start, *prev;
> @@ -2786,6 +2845,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> if (READ_ONCE(req->iopoll_completed))
> break;
>
> + /*
> + * Once REISSUE flag is set, the req has been done, and we
> + * have to retry
> + */
> + if (req->flags & REQ_F_REISSUE)
> + break;
> +
> ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
> if (unlikely(ret < 0))
> return ret;
> @@ -2807,6 +2873,12 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> wq_list_for_each_resume(pos, prev) {
> struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
>
> + if (req->flags & REQ_F_REISSUE) {
> + wq_list_remove(&ctx->iopoll_list, prev, pos);
> + do_io_reissue(req, -EIO);
> + break;
> + }
> +
That way you'll avoid redundant checks for REQ_F_REISSUE here.
Other than that:
Reviewed-by: Mike Snitzer <[email protected]>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
2022-04-04 16:51 ` Mike Snitzer
@ 2022-04-06 2:09 ` Ming Lei
2022-04-06 16:52 ` Mike Snitzer
0 siblings, 1 reply; 9+ messages in thread
From: Ming Lei @ 2022-04-06 2:09 UTC (permalink / raw)
To: Mike Snitzer; +Cc: Jens Axboe, io-uring, dm-devel
On Mon, Apr 04, 2022 at 12:51:30PM -0400, Mike Snitzer wrote:
> On Sun, Apr 03 2022 at 7:45P -0400,
> Ming Lei <[email protected]> wrote:
>
> > -EAGAIN still may return after io issue returns, and REQ_F_REISSUE is
> > set in io_complete_rw_iopoll(), but the req never gets chance to be handled.
> > io_iopoll_check doesn't handle this situation, and io hang can be caused.
> >
> > Current dm io polling may return -EAGAIN after bio submission is
> > returned, also blk-throttle might trigger this situation too.
> >
> > Cc: Mike Snitzer <[email protected]>
> > Signed-off-by: Ming Lei <[email protected]>
>
> I first reverted commit 5291984004ed ("dm: fix bio polling to handle
> possibile BLK_STS_AGAIN") then applied this patch and verified this
> fixes the DM bio polling hangs. Nice work!
>
> But interestingly with this fio test (against dm-linear ontop of
> null_blk with queue_mode=2 submit_queues=8 poll_queues=2 bs=4096 gb=16):
>
> fio --bs=4096 --ioengine=io_uring --fixedbufs --registerfiles --hipri=1 \
> --iodepth=16 --iodepth_batch_submit=16 --iodepth_batch_complete_min=16 \
> --filename=/dev/mapper/linear --direct=1 --runtime=20 --numjobs=16 \
> --rw=randread --name=test --group_reporting --norandommap
16 jobs in an io_uring/aio test is overkill.
>
> I get 3186k IOPS with your patch to have io_uring retry (and commit
> 5291984004ed reverted), but 4305k IOPS if leave commit 5291984004ed
> applied (and DM resorts to retrying any -EAGAIN _without_ polling).
IMO, commit 5291984004ed shouldn't be reverted, which is reasonable to
retry on underlying IO for dm.
This patch is for making io_uring more reliable, since the current
io_uring code only handles -EAGAIN from submission code path, and
-EAGAIN/REISSUE isn't handled if it is returned during ->poll(),
then the io hang is caused.
Jens, what do you think of this patch? Does io_uring need to handle
-EAGAIN in this case?
>
> Jens rightly pointed out to me that polling tests that exhaust tags
> are bogus anyway (because such unbounded IO defeats the point of
> polling). Jens also thinks my result, with commit 5291984004ed
> applied, is somehow bogus and not to be trusted ;) He is very likely
> correct, and the failing likely in the null_blk driver -- I'm
> skeptical of that driver given it cannot pass fio verify testing
> (e.g. --do_verify=1 --verify=crc32c --verify_async=1) with or without
> polling.
Because it is null block...
>
> Review comments inlined below.
>
> > ---
> > fs/io-wq.h | 13 +++++
> > fs/io_uring.c | 128 ++++++++++++++++++++++++++++----------------------
> > 2 files changed, 86 insertions(+), 55 deletions(-)
> >
> > diff --git a/fs/io-wq.h b/fs/io-wq.h
> > index dbecd27656c7..4ca4863664fb 100644
> > --- a/fs/io-wq.h
> > +++ b/fs/io-wq.h
> > @@ -96,6 +96,19 @@ static inline void wq_list_add_head(struct io_wq_work_node *node,
> > WRITE_ONCE(list->first, node);
> > }
> >
> > +static inline void wq_list_remove(struct io_wq_work_list *list,
> > + struct io_wq_work_node *prev,
> > + struct io_wq_work_node *node)
> > +{
> > + if (!prev)
> > + WRITE_ONCE(list->first, node->next);
> > + else
> > + prev->next = node->next;
> > +
> > + if (node == list->last)
> > + list->last = prev;
> > +}
> > +
> > static inline void wq_list_cut(struct io_wq_work_list *list,
> > struct io_wq_work_node *last,
> > struct io_wq_work_node *prev)
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 59e54a6854b7..6db5514e10ca 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -2759,6 +2759,65 @@ static inline bool io_run_task_work(void)
> > return false;
> > }
> >
> > +#ifdef CONFIG_BLOCK
> > +static bool io_resubmit_prep(struct io_kiocb *req)
> > +{
> > + struct io_async_rw *rw = req->async_data;
> > +
> > + if (!req_has_async_data(req))
> > + return !io_req_prep_async(req);
> > + iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
> > + return true;
> > +}
> > +
> > +static bool io_rw_should_reissue(struct io_kiocb *req)
> > +{
> > + umode_t mode = file_inode(req->file)->i_mode;
> > + struct io_ring_ctx *ctx = req->ctx;
> > +
> > + if (!S_ISBLK(mode) && !S_ISREG(mode))
> > + return false;
> > + if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
> > + !(ctx->flags & IORING_SETUP_IOPOLL)))
> > + return false;
> > + /*
> > + * If ref is dying, we might be running poll reap from the exit work.
> > + * Don't attempt to reissue from that path, just let it fail with
> > + * -EAGAIN.
> > + */
> > + if (percpu_ref_is_dying(&ctx->refs))
> > + return false;
> > + /*
> > + * Play it safe and assume not safe to re-import and reissue if we're
> > + * not in the original thread group (or in task context).
> > + */
> > + if (!same_thread_group(req->task, current) || !in_task())
> > + return false;
> > + return true;
> > +}
> > +#else
> > +static bool io_resubmit_prep(struct io_kiocb *req)
> > +{
> > + return false;
> > +}
> > +static bool io_rw_should_reissue(struct io_kiocb *req)
> > +{
> > + return false;
> > +}
> > +#endif
> > +
> > +static void do_io_reissue(struct io_kiocb *req, int ret)
> > +{
> > + if (req->flags & REQ_F_REISSUE) {
> > + req->flags &= ~REQ_F_REISSUE;
> > + if (io_resubmit_prep(req))
> > + io_req_task_queue_reissue(req);
> > + else
> > + io_req_task_queue_fail(req, ret);
> > + }
> > +}
>
> Minor nit but: I'd leave caller to check for REQ_F_REISSUE.
>
> > +
> > +
> > static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> > {
> > struct io_wq_work_node *pos, *start, *prev;
> > @@ -2786,6 +2845,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> > if (READ_ONCE(req->iopoll_completed))
> > break;
> >
> > + /*
> > + * Once REISSUE flag is set, the req has been done, and we
> > + * have to retry
> > + */
> > + if (req->flags & REQ_F_REISSUE)
> > + break;
> > +
> > ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
> > if (unlikely(ret < 0))
> > return ret;
> > @@ -2807,6 +2873,12 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> > wq_list_for_each_resume(pos, prev) {
> > struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
> >
> > + if (req->flags & REQ_F_REISSUE) {
> > + wq_list_remove(&ctx->iopoll_list, prev, pos);
> > + do_io_reissue(req, -EIO);
> > + break;
> > + }
> > +
>
> That way you'll avoid redundant checks for REQ_F_REISSUE here.
The other do_io_reissue() caller needn't remove the req from ->iopoll_list;
that is why the check is done here.
Thanks,
Ming
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
2022-04-06 2:09 ` Ming Lei
@ 2022-04-06 16:52 ` Mike Snitzer
0 siblings, 0 replies; 9+ messages in thread
From: Mike Snitzer @ 2022-04-06 16:52 UTC (permalink / raw)
To: Ming Lei; +Cc: Jens Axboe, dm-devel, io-uring
On Tue, Apr 05 2022 at 10:09P -0400,
Ming Lei <[email protected]> wrote:
> On Mon, Apr 04, 2022 at 12:51:30PM -0400, Mike Snitzer wrote:
> > On Sun, Apr 03 2022 at 7:45P -0400,
> > Ming Lei <[email protected]> wrote:
> >
> > > -EAGAIN still may return after io issue returns, and REQ_F_REISSUE is
> > > set in io_complete_rw_iopoll(), but the req never gets chance to be handled.
> > > io_iopoll_check doesn't handle this situation, and io hang can be caused.
> > >
> > > Current dm io polling may return -EAGAIN after bio submission is
> > > returned, also blk-throttle might trigger this situation too.
> > >
> > > Cc: Mike Snitzer <[email protected]>
> > > Signed-off-by: Ming Lei <[email protected]>
> >
> > I first reverted commit 5291984004ed ("dm: fix bio polling to handle
> > possibile BLK_STS_AGAIN") then applied this patch and verified this
> > fixes the DM bio polling hangs. Nice work!
> >
> > But interestingly with this fio test (against dm-linear ontop of
> > null_blk with queue_mode=2 submit_queues=8 poll_queues=2 bs=4096 gb=16):
> >
> > fio --bs=4096 --ioengine=io_uring --fixedbufs --registerfiles --hipri=1 \
> > --iodepth=16 --iodepth_batch_submit=16 --iodepth_batch_complete_min=16 \
> > --filename=/dev/mapper/linear --direct=1 --runtime=20 --numjobs=16 \
> > --rw=randread --name=test --group_reporting --norandommap
>
> 16jobs in io_uring/aio test is overkill.
Sure, it is.. I was just exhausting driver resources.. could fiddle
with it so that it exhausts quickly with even a single thread.
Besides the point really.
> > I get 3186k IOPS with your patch to have io_uring retry (and commit
> > 5291984004ed reverted), but 4305k IOPS if leave commit 5291984004ed
> > applied (and DM resorts to retrying any -EAGAIN _without_ polling).
>
> IMO, commit 5291984004ed shouldn't be reverted, which is reasonable to
> retry on underlying IO for dm.
Right, I wasn't saying commit 5291984004ed should be reverted. But I
was testing to see if your patch covered the case commit 5291984004ed
handles.
Note that the DM retry isn't in terms of polling though, polling gets
disabled when the bio is requeued to DM.
> This patch is for making io_uring more reliable, since the current
> io_uring code only handles -EAGAIN from submission code path, and
> -EAGAIN/REISSUE isn't handled if it is returned during ->poll(),
> then the io hang is caused.
>
> Jens, what do you think of this patch? Does io_uring need to handle
> -EAGAIN in this case?
>
> >
> > Jens rightly pointed out to me that polling tests that exhaust tags
> > are bogus anyway (because such unbounded IO defeats the point of
> > polling). Jens also thinks my result, with commit 5291984004ed
> > applied, is somehow bogus and not to be trusted ;) He is very likely
> > correct, and the failing likely in the null_blk driver -- I'm
> > skeptical of that driver given it cannot pass fio verify testing
> > (e.g. --do_verify=1 --verify=crc32c --verify_async=1) with or without
> > polling.
>
> Because it is null block...
Ha, yes.. very good point. I was expecting null_blk capability (read
back written data) that it was never intended to provide. Sorry ;)
> >
> > Review comments inlined below.
> >
> > > ---
> > > fs/io-wq.h | 13 +++++
> > > fs/io_uring.c | 128 ++++++++++++++++++++++++++++----------------------
> > > 2 files changed, 86 insertions(+), 55 deletions(-)
> > >
> > > diff --git a/fs/io-wq.h b/fs/io-wq.h
> > > index dbecd27656c7..4ca4863664fb 100644
> > > --- a/fs/io-wq.h
> > > +++ b/fs/io-wq.h
> > > @@ -96,6 +96,19 @@ static inline void wq_list_add_head(struct io_wq_work_node *node,
> > > WRITE_ONCE(list->first, node);
> > > }
> > >
> > > +static inline void wq_list_remove(struct io_wq_work_list *list,
> > > + struct io_wq_work_node *prev,
> > > + struct io_wq_work_node *node)
> > > +{
> > > + if (!prev)
> > > + WRITE_ONCE(list->first, node->next);
> > > + else
> > > + prev->next = node->next;
> > > +
> > > + if (node == list->last)
> > > + list->last = prev;
> > > +}
> > > +
> > > static inline void wq_list_cut(struct io_wq_work_list *list,
> > > struct io_wq_work_node *last,
> > > struct io_wq_work_node *prev)
> > > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > > index 59e54a6854b7..6db5514e10ca 100644
> > > --- a/fs/io_uring.c
> > > +++ b/fs/io_uring.c
> > > @@ -2759,6 +2759,65 @@ static inline bool io_run_task_work(void)
> > > return false;
> > > }
> > >
> > > +#ifdef CONFIG_BLOCK
> > > +static bool io_resubmit_prep(struct io_kiocb *req)
> > > +{
> > > + struct io_async_rw *rw = req->async_data;
> > > +
> > > + if (!req_has_async_data(req))
> > > + return !io_req_prep_async(req);
> > > + iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
> > > + return true;
> > > +}
> > > +
> > > +static bool io_rw_should_reissue(struct io_kiocb *req)
> > > +{
> > > + umode_t mode = file_inode(req->file)->i_mode;
> > > + struct io_ring_ctx *ctx = req->ctx;
> > > +
> > > + if (!S_ISBLK(mode) && !S_ISREG(mode))
> > > + return false;
> > > + if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
> > > + !(ctx->flags & IORING_SETUP_IOPOLL)))
> > > + return false;
> > > + /*
> > > + * If ref is dying, we might be running poll reap from the exit work.
> > > + * Don't attempt to reissue from that path, just let it fail with
> > > + * -EAGAIN.
> > > + */
> > > + if (percpu_ref_is_dying(&ctx->refs))
> > > + return false;
> > > + /*
> > > + * Play it safe and assume not safe to re-import and reissue if we're
> > > + * not in the original thread group (or in task context).
> > > + */
> > > + if (!same_thread_group(req->task, current) || !in_task())
> > > + return false;
> > > + return true;
> > > +}
> > > +#else
> > > +static bool io_resubmit_prep(struct io_kiocb *req)
> > > +{
> > > + return false;
> > > +}
> > > +static bool io_rw_should_reissue(struct io_kiocb *req)
> > > +{
> > > + return false;
> > > +}
> > > +#endif
> > > +
> > > +static void do_io_reissue(struct io_kiocb *req, int ret)
> > > +{
> > > + if (req->flags & REQ_F_REISSUE) {
> > > + req->flags &= ~REQ_F_REISSUE;
> > > + if (io_resubmit_prep(req))
> > > + io_req_task_queue_reissue(req);
> > > + else
> > > + io_req_task_queue_fail(req, ret);
> > > + }
> > > +}
> >
> > Minor nit but: I'd leave caller to check for REQ_F_REISSUE.
> >
> > > +
> > > +
> > > static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> > > {
> > > struct io_wq_work_node *pos, *start, *prev;
> > > @@ -2786,6 +2845,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> > > if (READ_ONCE(req->iopoll_completed))
> > > break;
> > >
> > > + /*
> > > + * Once REISSUE flag is set, the req has been done, and we
> > > + * have to retry
> > > + */
> > > + if (req->flags & REQ_F_REISSUE)
> > > + break;
> > > +
> > > ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
> > > if (unlikely(ret < 0))
> > > return ret;
> > > @@ -2807,6 +2873,12 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
> > > wq_list_for_each_resume(pos, prev) {
> > > struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
> > >
> > > + if (req->flags & REQ_F_REISSUE) {
> > > + wq_list_remove(&ctx->iopoll_list, prev, pos);
> > > + do_io_reissue(req, -EIO);
> > > + break;
> > > + }
> > > +
> >
> > That way you'll avoid redundant checks for REQ_F_REISSUE here.
>
> Another do_io_reissue() needn't to remove req from ->iopoll_list, that
> is why the check is done here.
All do_io_reissue callers would need to first check for REQ_F_REISSUE,
but my point (purely about avoiding redundant checks) is moot if this
patch isn't safe.
Mike
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
2022-04-03 11:45 [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns Ming Lei
2022-04-04 16:51 ` Mike Snitzer
@ 2022-04-06 2:20 ` Jens Axboe
2022-04-06 3:57 ` Ming Lei
1 sibling, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2022-04-06 2:20 UTC (permalink / raw)
To: Ming Lei, io-uring; +Cc: Mike Snitzer
On 4/3/22 5:45 AM, Ming Lei wrote:
> -EAGAIN still may return after io issue returns, and REQ_F_REISSUE is
> set in io_complete_rw_iopoll(), but the req never gets chance to be handled.
> io_iopoll_check doesn't handle this situation, and io hang can be caused.
>
> Current dm io polling may return -EAGAIN after bio submission is
> returned, also blk-throttle might trigger this situation too.
I don't think this is necessarily safe. Handling REQ_F_REISSUE from within
the issue path is fine, as the request hasn't been submitted yet and
hence we know that passed in structs are still stable. Once you hit it
when polling for it, the io_uring_enter() call to submit requests has
potentially already returned, and now we're in a second call where we
are polling for requests. If we're doing eg an IORING_OP_READV, the
original iovec may no longer be valid and we cannot safely re-import
data associated with it.
Hence I don't think the patch is safe and we cannot reliably handle this
scenario. dm would need to retry internally for this.
--
Jens Axboe
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
2022-04-06 2:20 ` Jens Axboe
@ 2022-04-06 3:57 ` Ming Lei
2022-04-06 12:58 ` Jens Axboe
0 siblings, 1 reply; 9+ messages in thread
From: Ming Lei @ 2022-04-06 3:57 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring, Mike Snitzer
On Tue, Apr 05, 2022 at 08:20:24PM -0600, Jens Axboe wrote:
> On 4/3/22 5:45 AM, Ming Lei wrote:
> > -EAGAIN still may return after io issue returns, and REQ_F_REISSUE is
> > set in io_complete_rw_iopoll(), but the req never gets chance to be handled.
> > io_iopoll_check doesn't handle this situation, and io hang can be caused.
> >
> > Current dm io polling may return -EAGAIN after bio submission is
> > returned, also blk-throttle might trigger this situation too.
>
> I don't think this is necessarily safe. Handling REQ_F_ISSUE from within
> the issue path is fine, as the request hasn't been submitted yet and
> hence we know that passed in structs are still stable. Once you hit it
> when polling for it, the io_uring_enter() call to submit requests has
> potentially already returned, and now we're in a second call where we
> are polling for requests. If we're doing eg an IORING_OP_READV, the
> original iovec may no longer be valid and we cannot safely re-import
> data associated with it.
Yeah, this reissue is really not safe, thanks for the input.
I guess the only way is to complete the cqe for this situation.
>
> Hence I don't think the patch is safe and we cannot reliably handle this
> scenario. dm would need to retry internally for this.
Another scenario might be bio throttle, which may delay submit_bio into kthrotld
wq context, where request allocation can return -EAGAIN too.
But I haven't reproduced the problem in this scenario yet.
Thanks,
Ming
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
2022-04-06 3:57 ` Ming Lei
@ 2022-04-06 12:58 ` Jens Axboe
2022-04-06 13:21 ` Ming Lei
0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2022-04-06 12:58 UTC (permalink / raw)
To: Ming Lei; +Cc: io-uring, Mike Snitzer
On 4/5/22 9:57 PM, Ming Lei wrote:
> On Tue, Apr 05, 2022 at 08:20:24PM -0600, Jens Axboe wrote:
>> On 4/3/22 5:45 AM, Ming Lei wrote:
>>> -EAGAIN still may return after io issue returns, and REQ_F_REISSUE is
>>> set in io_complete_rw_iopoll(), but the req never gets chance to be handled.
>>> io_iopoll_check doesn't handle this situation, and io hang can be caused.
>>>
>>> Current dm io polling may return -EAGAIN after bio submission is
>>> returned, also blk-throttle might trigger this situation too.
>>
>> I don't think this is necessarily safe. Handling REQ_F_ISSUE from within
>> the issue path is fine, as the request hasn't been submitted yet and
>> hence we know that passed in structs are still stable. Once you hit it
>> when polling for it, the io_uring_enter() call to submit requests has
>> potentially already returned, and now we're in a second call where we
>> are polling for requests. If we're doing eg an IORING_OP_READV, the
>> original iovec may no longer be valid and we cannot safely re-import
>> data associated with it.
>
> Yeah, this reissue is really not safe, thanks for the input.
>
> I guess the only way is to complete the cqe for this situation.
At least if
io_op_defs[req->opcode].needs_async_setup
is true it isn't safe. But can't dm appropriately retry rather than
bubble up the -EAGAIN off ->iopoll?
--
Jens Axboe
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
2022-04-06 12:58 ` Jens Axboe
@ 2022-04-06 13:21 ` Ming Lei
2022-04-06 16:38 ` Mike Snitzer
0 siblings, 1 reply; 9+ messages in thread
From: Ming Lei @ 2022-04-06 13:21 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring, Mike Snitzer
On Wed, Apr 06, 2022 at 06:58:28AM -0600, Jens Axboe wrote:
> On 4/5/22 9:57 PM, Ming Lei wrote:
> > On Tue, Apr 05, 2022 at 08:20:24PM -0600, Jens Axboe wrote:
> >> On 4/3/22 5:45 AM, Ming Lei wrote:
> >>> -EAGAIN still may return after io issue returns, and REQ_F_REISSUE is
> >>> set in io_complete_rw_iopoll(), but the req never gets chance to be handled.
> >>> io_iopoll_check doesn't handle this situation, and io hang can be caused.
> >>>
> >>> Current dm io polling may return -EAGAIN after bio submission is
> >>> returned, also blk-throttle might trigger this situation too.
> >>
> >> I don't think this is necessarily safe. Handling REQ_F_REISSUE from within
> >> the issue path is fine, as the request hasn't been submitted yet and
> >> hence we know that passed in structs are still stable. Once you hit it
> >> when polling for it, the io_uring_enter() call to submit requests has
> >> potentially already returned, and now we're in a second call where we
> >> are polling for requests. If we're doing eg an IORING_OP_READV, the
> >> original iovec may no longer be valid and we cannot safely re-import
> >> data associated with it.
> >
> > Yeah, this reissue is really not safe, thanks for the input.
> >
> > I guess the only way is to complete the cqe for this situation.
>
> At least if
>
> io_op_defs[req->opcode].needs_async_setup
>
> is true it isn't safe. But can't dm appropriately retry rather than
> bubble up the -EAGAIN off ->iopoll?
The thing is that not only DM has such issue.
NVMe multipath has the risk, and blk-throttle/blk-cgroup may run into such
situation too.
In any situation in which submit_bio() runs into async bio submission, the
issue may be triggered.
Thanks,
Ming
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns
2022-04-06 13:21 ` Ming Lei
@ 2022-04-06 16:38 ` Mike Snitzer
0 siblings, 0 replies; 9+ messages in thread
From: Mike Snitzer @ 2022-04-06 16:38 UTC (permalink / raw)
To: Ming Lei; +Cc: Jens Axboe, io-uring, Mike Snitzer
On Wed, Apr 06 2022 at 9:21P -0400,
Ming Lei <[email protected]> wrote:
> On Wed, Apr 06, 2022 at 06:58:28AM -0600, Jens Axboe wrote:
> > On 4/5/22 9:57 PM, Ming Lei wrote:
> > > On Tue, Apr 05, 2022 at 08:20:24PM -0600, Jens Axboe wrote:
> > >> On 4/3/22 5:45 AM, Ming Lei wrote:
> > >>> -EAGAIN still may return after io issue returns, and REQ_F_REISSUE is
> > >>> set in io_complete_rw_iopoll(), but the req never gets chance to be handled.
> > >>> io_iopoll_check doesn't handle this situation, and io hang can be caused.
> > >>>
> > >>> Current dm io polling may return -EAGAIN after bio submission is
> > >>> returned, also blk-throttle might trigger this situation too.
> > >>
> > >> I don't think this is necessarily safe. Handling REQ_F_REISSUE from within
> > >> the issue path is fine, as the request hasn't been submitted yet and
> > >> hence we know that passed in structs are still stable. Once you hit it
> > >> when polling for it, the io_uring_enter() call to submit requests has
> > >> potentially already returned, and now we're in a second call where we
> > >> are polling for requests. If we're doing eg an IORING_OP_READV, the
> > >> original iovec may no longer be valid and we cannot safely re-import
> > >> data associated with it.
> > >
> > > Yeah, this reissue is really not safe, thanks for the input.
> > >
> > > I guess the only way is to complete the cqe for this situation.
> >
> > At least if
> >
> > io_op_defs[req->opcode].needs_async_setup
> >
> > is true it isn't safe. But can't dm appropriately retry rather than
> > bubble up the -EAGAIN off ->iopoll?
The -EAGAIN is happening at submission, but this is bio-based so it is
felt past the point of ->submit_bio return. The underlying
request-based driver has run out of tags and so the bio is errored by
block core. So it isn't felt until bio completion time.
In the case of DM, that stack trace looks like:
[168195.924803] RIP: 0010:dm_io_complete+0x1e0/0x1f0 [dm_mod]
<snip>
[168196.029002] Call Trace:
[168196.031543] <TASK>
[168196.033737] dm_poll_bio+0xd7/0x170 [dm_mod]
[168196.038106] bio_poll+0xe3/0x110
[168196.041435] iocb_bio_iopoll+0x34/0x50
[168196.045278] io_do_iopoll+0xfb/0x400
[168196.048947] io_iopoll_check+0x5d/0x140
[168196.052873] __do_sys_io_uring_enter+0x3d9/0x440
[168196.057578] do_syscall_64+0x3a/0x80
[168196.061246] entry_SYSCALL_64_after_hwframe+0x44/0xae
But prior to that, the DM clone bio's ->bi_end_io (clone_endio) was
called with BLK_STS_AGAIN -- it's just that by design dm's ->poll_bio
is what triggers final dm_io_complete() of the original polled bio (in
terms of polling process's context).
> The thing is that not only DM has such issue.
>
> NVMe multipath has the risk, and blk-throttle/blk-cgroup may run into such
> situation too.
>
> Any situation in which submit_bio() runs into async bio submission, the
> issue may be triggered.
bio-based is always async by virtue of bios getting packed into a
request after ->submit_bio returns. But to do so an available tag is
needed.
Mike
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2022-04-06 18:12 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-03 11:45 [RFC PATCH] io_uring: reissue in case -EAGAIN is returned after io issue returns Ming Lei
2022-04-04 16:51 ` Mike Snitzer
2022-04-06 2:09 ` Ming Lei
2022-04-06 16:52 ` Mike Snitzer
2022-04-06 2:20 ` Jens Axboe
2022-04-06 3:57 ` Ming Lei
2022-04-06 12:58 ` Jens Axboe
2022-04-06 13:21 ` Ming Lei
2022-04-06 16:38 ` Mike Snitzer
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox