public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups
@ 2021-02-04 13:51 Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence Pavel Begunkov
                   ` (13 more replies)
  0 siblings, 14 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Bunch of random cleanups, noticeable part of which (4-9/13) refactor
io_read(), which is currently not in the best shape and is hard to
understand.

7/13 may actually fix a problem.
10/13 should make NOWAIT and NONBLOCK to work right

v2: add 9-13/13

Pavel Begunkov (13):
  io_uring: deduplicate core cancellations sequence
  io_uring: refactor scheduling in io_cqring_wait
  io_uring: refactor io_cqring_wait
  io_uring: refactor io_read for unsupported nowait
  io_uring: further simplify do_read error parsing
  io_uring: let io_setup_async_rw take care of iovec
  io_uring: don't forget to adjust io_size
  io_uring: inline io_read()'s iovec freeing
  io_uring: highlight read-retry loop
  io_uring: treat NONBLOCK and RWF_NOWAIT similarly
  io_uring: io_import_iovec return type cleanup
  io_uring: deduplicate file table slot calculation
  io_uring/io-wq: return 2-step work swap scheme

 fs/io-wq.c    |  16 +--
 fs/io-wq.h    |   4 +-
 fs/io_uring.c | 366 ++++++++++++++++++++++----------------------------
 3 files changed, 166 insertions(+), 220 deletions(-)

-- 
2.24.0


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
@ 2021-02-04 13:51 ` Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 02/13] io_uring: refactor scheduling in io_cqring_wait Pavel Begunkov
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Files and task cancellations go over same steps trying to cancel
requests in io-wq, poll, etc. Deduplicate it with a helper.

note: new io_uring_try_cancel_requests() is former
__io_uring_cancel_task_requests() with files passed as an agrument and
flushing overflowed requests.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 85 ++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 45 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24ad36d71289..a750c504366d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1003,9 +1003,9 @@ enum io_mem_account {
 	ACCT_PINNED,
 };
 
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-					    struct task_struct *task);
-
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+					 struct task_struct *task,
+					 struct files_struct *files);
 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 			struct io_ring_ctx *ctx);
@@ -8817,7 +8817,7 @@ static void io_ring_exit_work(struct work_struct *work)
 	 * as nobody else will be looking for them.
 	 */
 	do {
-		__io_uring_cancel_task_requests(ctx, NULL);
+		io_uring_try_cancel_requests(ctx, NULL, NULL);
 	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 	io_ring_ctx_free(ctx);
 }
@@ -8931,6 +8931,40 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
 	}
 }
 
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+					 struct task_struct *task,
+					 struct files_struct *files)
+{
+	struct io_task_cancel cancel = { .task = task, .files = files, };
+
+	while (1) {
+		enum io_wq_cancel cret;
+		bool ret = false;
+
+		if (ctx->io_wq) {
+			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+					       &cancel, true);
+			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
+		}
+
+		/* SQPOLL thread does its own polling */
+		if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
+			while (!list_empty_careful(&ctx->iopoll_list)) {
+				io_iopoll_try_reap_events(ctx);
+				ret = true;
+			}
+		}
+
+		ret |= io_poll_remove_all(ctx, task, files);
+		ret |= io_kill_timeouts(ctx, task, files);
+		ret |= io_run_task_work();
+		io_cqring_overflow_flush(ctx, true, task, files);
+		if (!ret)
+			break;
+		cond_resched();
+	}
+}
+
 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
 				   struct task_struct *task,
 				   struct files_struct *files)
@@ -8950,7 +8984,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 				  struct files_struct *files)
 {
 	while (!list_empty_careful(&ctx->inflight_list)) {
-		struct io_task_cancel cancel = { .task = task, .files = files };
 		DEFINE_WAIT(wait);
 		int inflight;
 
@@ -8958,13 +8991,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 		if (!inflight)
 			break;
 
-		io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
-		io_poll_remove_all(ctx, task, files);
-		io_kill_timeouts(ctx, task, files);
-		io_cqring_overflow_flush(ctx, true, task, files);
-		/* cancellations _may_ trigger task work */
-		io_run_task_work();
-
+		io_uring_try_cancel_requests(ctx, task, files);
 		prepare_to_wait(&task->io_uring->wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (inflight == io_uring_count_inflight(ctx, task, files))
@@ -8973,37 +9000,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 	}
 }
 
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-					    struct task_struct *task)
-{
-	while (1) {
-		struct io_task_cancel cancel = { .task = task, .files = NULL, };
-		enum io_wq_cancel cret;
-		bool ret = false;
-
-		if (ctx->io_wq) {
-			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
-					       &cancel, true);
-			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
-		}
-
-		/* SQPOLL thread does its own polling */
-		if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
-			while (!list_empty_careful(&ctx->iopoll_list)) {
-				io_iopoll_try_reap_events(ctx);
-				ret = true;
-			}
-		}
-
-		ret |= io_poll_remove_all(ctx, task, NULL);
-		ret |= io_kill_timeouts(ctx, task, NULL);
-		ret |= io_run_task_work();
-		if (!ret)
-			break;
-		cond_resched();
-	}
-}
-
 static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
@@ -9033,11 +9029,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 	}
 
 	io_cancel_defer_files(ctx, task, files);
-	io_cqring_overflow_flush(ctx, true, task, files);
 
 	io_uring_cancel_files(ctx, task, files);
 	if (!files)
-		__io_uring_cancel_task_requests(ctx, task);
+		io_uring_try_cancel_requests(ctx, task, NULL);
 
 	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
 		atomic_dec(&task->io_uring->in_idle);
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 02/13] io_uring: refactor scheduling in io_cqring_wait
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence Pavel Begunkov
@ 2021-02-04 13:51 ` Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 03/13] io_uring: refactor io_cqring_wait Pavel Begunkov
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

schedule_timeout() with timeout=MAX_SCHEDULE_TIMEOUT is guaranteed to
work just as schedule(), so instead of hand-coding it based on arguments
always use the timeout version and simplify code.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a750c504366d..5b735635b8f0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7213,9 +7213,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
-	struct timespec64 ts;
-	signed long timeout = 0;
-	int ret = 0;
+	signed long timeout = MAX_SCHEDULE_TIMEOUT;
+	int ret;
 
 	do {
 		io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -7239,6 +7238,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	}
 
 	if (uts) {
+		struct timespec64 ts;
+
 		if (get_timespec64(&ts, uts))
 			return -EFAULT;
 		timeout = timespec64_to_jiffies(&ts);
@@ -7264,14 +7265,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			finish_wait(&ctx->wait, &iowq.wq);
 			continue;
 		}
-		if (uts) {
-			timeout = schedule_timeout(timeout);
-			if (timeout == 0) {
-				ret = -ETIME;
-				break;
-			}
-		} else {
-			schedule();
+		timeout = schedule_timeout(timeout);
+		if (timeout == 0) {
+			ret = -ETIME;
+			break;
 		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 03/13] io_uring: refactor io_cqring_wait
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 02/13] io_uring: refactor scheduling in io_cqring_wait Pavel Begunkov
@ 2021-02-04 13:51 ` Pavel Begunkov
  2021-02-04 13:51 ` [PATCH v2 04/13] io_uring: refactor io_read for unsupported nowait Pavel Begunkov
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

It's easy to make a mistake in io_cqring_wait() because for all
break/continue clauses we need to watch for prepare/finish_wait to be
used correctly. Extract all those into a new helper
io_cqring_wait_schedule(), and transforming the loop into simple series
of func calls: prepare(); check_and_schedule(); finish();

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5b735635b8f0..dcb9e937daa3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7195,6 +7195,25 @@ static int io_run_task_work_sig(void)
 	return -EINTR;
 }
 
+/* when returns >0, the caller should retry */
+static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+					  struct io_wait_queue *iowq,
+					  signed long *timeout)
+{
+	int ret;
+
+	/* make sure we run task_work before checking for signals */
+	ret = io_run_task_work_sig();
+	if (ret || io_should_wake(iowq))
+		return ret;
+	/* let the caller flush overflows, retry */
+	if (test_bit(0, &ctx->cq_check_overflow))
+		return 1;
+
+	*timeout = schedule_timeout(*timeout);
+	return !*timeout ? -ETIME : 1;
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -7251,27 +7270,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		io_cqring_overflow_flush(ctx, false, NULL, NULL);
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
-		/* make sure we run task_work before checking for signals */
-		ret = io_run_task_work_sig();
-		if (ret > 0) {
-			finish_wait(&ctx->wait, &iowq.wq);
-			continue;
-		}
-		else if (ret < 0)
-			break;
-		if (io_should_wake(&iowq))
-			break;
-		if (test_bit(0, &ctx->cq_check_overflow)) {
-			finish_wait(&ctx->wait, &iowq.wq);
-			continue;
-		}
-		timeout = schedule_timeout(timeout);
-		if (timeout == 0) {
-			ret = -ETIME;
-			break;
-		}
-	} while (1);
-	finish_wait(&ctx->wait, &iowq.wq);
+		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+		finish_wait(&ctx->wait, &iowq.wq);
+	} while (ret > 0);
 
 	restore_saved_sigmask_unless(ret == -EINTR);
 
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 04/13] io_uring: refactor io_read for unsupported nowait
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (2 preceding siblings ...)
  2021-02-04 13:51 ` [PATCH v2 03/13] io_uring: refactor io_cqring_wait Pavel Begunkov
@ 2021-02-04 13:51 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 05/13] io_uring: further simplify do_read error parsing Pavel Begunkov
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

!io_file_supports_async() case of io_read() is hard to read, it jumps
somewhere in the middle of the function just to do async setup and fail
on a similar check. Call io_setup_async_rw() directly for this case,
it's much easier to follow.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index dcb9e937daa3..866e0ea83dbe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3506,7 +3506,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	struct iov_iter __iter, *iter = &__iter;
 	struct io_async_rw *rw = req->async_data;
 	ssize_t io_size, ret, ret2;
-	bool no_async;
 
 	if (rw) {
 		iter = &rw->iter;
@@ -3527,9 +3526,12 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		kiocb->ki_flags |= IOCB_NOWAIT;
 
 	/* If the file doesn't support async, just async punt */
-	no_async = force_nonblock && !io_file_supports_async(req->file, READ);
-	if (no_async)
-		goto copy_iov;
+	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
+		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+		if (!ret)
+			return -EAGAIN;
+		goto out_free;
+	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
 	if (unlikely(ret))
@@ -3568,8 +3570,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		ret = ret2;
 		goto out_free;
 	}
-	if (no_async)
-		return -EAGAIN;
 	rw = req->async_data;
 	/* it's copied and will be cleaned with ->io */
 	iovec = NULL;
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 05/13] io_uring: further simplify do_read error parsing
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (3 preceding siblings ...)
  2021-02-04 13:51 ` [PATCH v2 04/13] io_uring: refactor io_read for unsupported nowait Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 06/13] io_uring: let io_setup_async_rw take care of iovec Pavel Begunkov
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

First, instead of checking iov_iter_count(iter) for 0 to find out that
all needed bytes were read, just compare returned code against io_size.
It's more reliable and arguably cleaner.

Also, place the half-read case into an else branch and delete an extra
label.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 866e0ea83dbe..1d1fa1f77332 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3552,19 +3552,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
-		goto copy_iov;
-	} else if (ret <= 0) {
+	} else if (ret <= 0 || ret == io_size) {
 		/* make sure -ERESTARTSYS -> -EINTR is done */
 		goto done;
-	}
+	} else {
+		/* we did blocking attempt. no retry. */
+		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
+		    !(req->flags & REQ_F_ISREG))
+			goto done;
 
-	/* read it all, or we did blocking attempt. no retry. */
-	if (!iov_iter_count(iter) || !force_nonblock ||
-	    (req->file->f_flags & O_NONBLOCK) || !(req->flags & REQ_F_ISREG))
-		goto done;
+		io_size -= ret;
+	}
 
-	io_size -= ret;
-copy_iov:
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
 	if (ret2) {
 		ret = ret2;
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 06/13] io_uring: let io_setup_async_rw take care of iovec
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (4 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 05/13] io_uring: further simplify do_read error parsing Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 07/13] io_uring: don't forget to adjust io_size Pavel Begunkov
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Now we give out ownership of iovec into io_setup_async_rw(), so it
either sets request's context right or frees the iovec on error itself.
Makes our life a bit easier at call sites.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1d1fa1f77332..f8492d62b6a1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2721,11 +2721,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)
 	ret = io_import_iovec(rw, req, &iovec, &iter, false);
 	if (ret < 0)
 		return false;
-	ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
-	if (!ret)
-		return true;
-	kfree(iovec);
-	return false;
+	return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
 }
 #endif
 
@@ -3366,8 +3362,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 	if (!force && !io_op_defs[req->opcode].needs_async_data)
 		return 0;
 	if (!req->async_data) {
-		if (__io_alloc_async_data(req))
+		if (__io_alloc_async_data(req)) {
+			kfree(iovec);
 			return -ENOMEM;
+		}
 
 		io_req_map_rw(req, iovec, fast_iov, iter);
 	}
@@ -3528,9 +3526,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-		if (!ret)
-			return -EAGAIN;
-		goto out_free;
+		return ret ?: -EAGAIN;
 	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
@@ -3565,10 +3561,9 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-	if (ret2) {
-		ret = ret2;
-		goto out_free;
-	}
+	if (ret2)
+		return ret2;
+
 	rw = req->async_data;
 	/* it's copied and will be cleaned with ->io */
 	iovec = NULL;
@@ -3703,8 +3698,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
-		if (!ret)
-			return -EAGAIN;
+		return ret ?: -EAGAIN;
 	}
 out_free:
 	/* it's reportedly faster than delegating the null check to kfree() */
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 07/13] io_uring: don't forget to adjust io_size
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (5 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 06/13] io_uring: let io_setup_async_rw take care of iovec Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 08/13] io_uring: inline io_read()'s iovec freeing Pavel Begunkov
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We have invariant in io_read() of how much we're trying to read spilled
into an iter and io_size variable. The last one controls decision making
about whether to do read-retries. However, io_size is modified only
after the first read attempt, so if we happen to go for a third retry in
a single call to io_read(), we will get io_size greater than in the
iterator, so may lead to various side effects up to live-locking.

Modify io_size each time.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8492d62b6a1..25fffff27c76 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3548,16 +3548,11 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
-	} else if (ret <= 0 || ret == io_size) {
-		/* make sure -ERESTARTSYS -> -EINTR is done */
+	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
+		   (req->file->f_flags & O_NONBLOCK) ||
+		   !(req->flags & REQ_F_ISREG)) {
+		/* read all, failed, already did sync or don't want to retry */
 		goto done;
-	} else {
-		/* we did blocking attempt. no retry. */
-		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
-		    !(req->flags & REQ_F_ISREG))
-			goto done;
-
-		io_size -= ret;
 	}
 
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
@@ -3570,6 +3565,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
 retry:
+	io_size -= ret;
 	rw->bytes_done += ret;
 	/* if we can retry, do so with the callbacks armed */
 	if (!io_rw_should_retry(req)) {
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 08/13] io_uring: inline io_read()'s iovec freeing
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (6 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 07/13] io_uring: don't forget to adjust io_size Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 09/13] io_uring: highlight read-retry loop Pavel Begunkov
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_read() has not the simpliest control flow with a lot of jumps and
it's hard to read. One of those is a out_free: label, which frees iovec.
However, from the middle of io_read() iovec is NULL'ed and so
kfree(iovec) is no-op, it leaves us with two place where we can inline
it and further clean up the code.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 25fffff27c76..35ad889afaec 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3530,14 +3530,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
-	if (unlikely(ret))
-		goto out_free;
+	if (unlikely(ret)) {
+		kfree(iovec);
+		return ret;
+	}
 
 	ret = io_iter_do_read(req, iter);
 
 	if (ret == -EIOCBQUEUED) {
-		ret = 0;
-		goto out_free;
+		/* it's faster to check here then delegate to kfree */
+		if (iovec)
+			kfree(iovec);
+		return 0;
 	} else if (ret == -EAGAIN) {
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -3560,8 +3564,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		return ret2;
 
 	rw = req->async_data;
-	/* it's copied and will be cleaned with ->io */
-	iovec = NULL;
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
 retry:
@@ -3580,21 +3582,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	 * do, then just retry at the new offset.
 	 */
 	ret = io_iter_do_read(req, iter);
-	if (ret == -EIOCBQUEUED) {
-		ret = 0;
-		goto out_free;
-	} else if (ret > 0 && ret < io_size) {
-		/* we got some bytes, but not all. retry. */
+	if (ret == -EIOCBQUEUED)
+		return 0;
+	/* we got some bytes, but not all. retry. */
+	if (ret > 0 && ret < io_size)
 		goto retry;
-	}
 done:
 	kiocb_done(kiocb, ret, cs);
-	ret = 0;
-out_free:
-	/* it's reportedly faster than delegating the null check to kfree() */
-	if (iovec)
-		kfree(iovec);
-	return ret;
+	return 0;
 }
 
 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 09/13] io_uring: highlight read-retry loop
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (7 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 08/13] io_uring: inline io_read()'s iovec freeing Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 10/13] io_uring: treat NONBLOCK and RWF_NOWAIT similarly Pavel Begunkov
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We already have implicit do-while for read-retries but with goto in the
end. Convert it to an actual do-while, it highlights it so making a
bit more understandable and is cleaner in general.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 35ad889afaec..bbf8ea8370d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3566,27 +3566,27 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	rw = req->async_data;
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
-retry:
-	io_size -= ret;
-	rw->bytes_done += ret;
-	/* if we can retry, do so with the callbacks armed */
-	if (!io_rw_should_retry(req)) {
-		kiocb->ki_flags &= ~IOCB_WAITQ;
-		return -EAGAIN;
-	}
 
-	/*
-	 * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
-	 * get -EIOCBQUEUED, then we'll get a notification when the desired
-	 * page gets unlocked. We can also get a partial read here, and if we
-	 * do, then just retry at the new offset.
-	 */
-	ret = io_iter_do_read(req, iter);
-	if (ret == -EIOCBQUEUED)
-		return 0;
-	/* we got some bytes, but not all. retry. */
-	if (ret > 0 && ret < io_size)
-		goto retry;
+	do {
+		io_size -= ret;
+		rw->bytes_done += ret;
+		/* if we can retry, do so with the callbacks armed */
+		if (!io_rw_should_retry(req)) {
+			kiocb->ki_flags &= ~IOCB_WAITQ;
+			return -EAGAIN;
+		}
+
+		/*
+		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
+		 * we get -EIOCBQUEUED, then we'll get a notification when the
+		 * desired page gets unlocked. We can also get a partial read
+		 * here, and if we do, then just retry at the new offset.
+		 */
+		ret = io_iter_do_read(req, iter);
+		if (ret == -EIOCBQUEUED)
+			return 0;
+		/* we got some bytes, but not all. retry. */
+	} while (ret > 0 && ret < io_size);
 done:
 	kiocb_done(kiocb, ret, cs);
 	return 0;
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 10/13] io_uring: treat NONBLOCK and RWF_NOWAIT similarly
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (8 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 09/13] io_uring: highlight read-retry loop Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 11/13] io_uring: io_import_iovec return type cleanup Pavel Begunkov
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Make decision making of whether we need to retry read/write similar for
O_NONBLOCK and RWF_NOWAIT. Set REQ_F_NOWAIT when either is specified and
use it for all relevant checks. Also fix resubmitting NOWAIT requests
via io_rw_reissue().

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bbf8ea8370d6..ce2ea3f55f65 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2734,7 +2734,9 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 	if (res != -EAGAIN && res != -EOPNOTSUPP)
 		return false;
 	mode = file_inode(req->file)->i_mode;
-	if ((!S_ISBLK(mode) && !S_ISREG(mode)) || io_wq_current_is_worker())
+	if (!S_ISBLK(mode) && !S_ISREG(mode))
+		return false;
+	if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
 		return false;
 
 	lockdep_assert_held(&req->ctx->uring_lock);
@@ -2907,16 +2909,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw.kiocb;
+	struct file *file = req->file;
 	unsigned ioprio;
 	int ret;
 
-	if (S_ISREG(file_inode(req->file)->i_mode))
+	if (S_ISREG(file_inode(file)->i_mode))
 		req->flags |= REQ_F_ISREG;
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
-	if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
+	if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
 		req->flags |= REQ_F_CUR_POS;
-		kiocb->ki_pos = req->file->f_pos;
+		kiocb->ki_pos = file->f_pos;
 	}
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
@@ -2924,6 +2927,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(ret))
 		return ret;
 
+	/* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
+	if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
+		req->flags |= REQ_F_NOWAIT;
+
 	ioprio = READ_ONCE(sqe->ioprio);
 	if (ioprio) {
 		ret = ioprio_check_cap(ioprio);
@@ -2934,10 +2941,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	} else
 		kiocb->ki_ioprio = get_current_ioprio();
 
-	/* don't allow async punt if RWF_NOWAIT was requested */
-	if (kiocb->ki_flags & IOCB_NOWAIT)
-		req->flags |= REQ_F_NOWAIT;
-
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
 		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 		    !kiocb->ki_filp->f_op->iopoll)
@@ -3546,15 +3549,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
 			goto done;
-		/* no retry on NONBLOCK marked file */
-		if (req->file->f_flags & O_NONBLOCK)
+		/* no retry on NONBLOCK nor RWF_NOWAIT */
+		if (req->flags & REQ_F_NOWAIT)
 			goto done;
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
 	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
-		   (req->file->f_flags & O_NONBLOCK) ||
-		   !(req->flags & REQ_F_ISREG)) {
+		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
 		/* read all, failed, already did sync or don't want to retry */
 		goto done;
 	}
@@ -3675,8 +3677,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	 */
 	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
 		ret2 = -EAGAIN;
-	/* no retry on NONBLOCK marked file */
-	if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
+	/* no retry on NONBLOCK nor RWF_NOWAIT */
+	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
 		goto done;
 	if (!force_nonblock || ret2 != -EAGAIN) {
 		/* IOPOLL retry should happen for io-wq threads */
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 11/13] io_uring: io_import_iovec return type cleanup
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (9 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 10/13] io_uring: treat NONBLOCK and RWF_NOWAIT similarly Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 12/13] io_uring: deduplicate file table slot calculation Pavel Begunkov
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_import_iovec() doesn't return IO size anymore, only error code. Make
it more apparent by returning int instead of ssize and clean up
leftovers.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ce2ea3f55f65..24cc00ff7155 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1030,9 +1030,8 @@ static struct file *io_file_get(struct io_submit_state *state,
 static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
 static void io_rsrc_put_work(struct work_struct *work);
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-			       struct iovec **iovec, struct iov_iter *iter,
-			       bool needs_lock);
+static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
+			   struct iov_iter *iter, bool needs_lock);
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 			     const struct iovec *fast_iov,
 			     struct iov_iter *iter, bool force);
@@ -2693,9 +2692,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 static bool io_resubmit_prep(struct io_kiocb *req)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
-	ssize_t ret = -ECANCELED;
+	int rw, ret = -ECANCELED;
 	struct iov_iter iter;
-	int rw;
 
 	/* already prepared */
 	if (req->async_data)
@@ -3004,8 +3002,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 		io_rw_done(kiocb, ret);
 }
 
-static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
-			       struct iov_iter *iter)
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	size_t len = req->rw.len;
@@ -3069,7 +3066,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
 		}
 	}
 
-	return len;
+	return 0;
 }
 
 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
@@ -3210,16 +3207,14 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 	return __io_iov_buffer_select(req, iov, needs_lock);
 }
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-				 struct iovec **iovec, struct iov_iter *iter,
-				 bool needs_lock)
+static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
+			   struct iov_iter *iter, bool needs_lock)
 {
 	void __user *buf = u64_to_user_ptr(req->rw.addr);
 	size_t sqe_len = req->rw.len;
+	u8 opcode = req->opcode;
 	ssize_t ret;
-	u8 opcode;
 
-	opcode = req->opcode;
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
 		*iovec = NULL;
 		return io_import_fixed(req, rw, iter);
@@ -3244,10 +3239,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		ret = io_iov_buffer_select(req, *iovec, needs_lock);
-		if (!ret) {
-			ret = (*iovec)->iov_len;
-			iov_iter_init(iter, rw, *iovec, 1, ret);
-		}
+		if (!ret)
+			iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
 		*iovec = NULL;
 		return ret;
 	}
@@ -3379,7 +3372,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 {
 	struct io_async_rw *iorw = req->async_data;
 	struct iovec *iov = iorw->fast_iov;
-	ssize_t ret;
+	int ret;
 
 	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
 	if (unlikely(ret < 0))
@@ -3518,7 +3511,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 	io_size = iov_iter_count(iter);
 	req->result = io_size;
-	ret = 0;
 
 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 12/13] io_uring: deduplicate file table slot calculation
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (10 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 11/13] io_uring: io_import_iovec return type cleanup Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 13:52 ` [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme Pavel Begunkov
  2021-02-04 15:07 ` [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Jens Axboe
  13 siblings, 0 replies; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Extract a helper io_fixed_file_slot() returning a place in our fixed
files table, so we don't hand-code it three times in the code.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24cc00ff7155..5ee6a9273fca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7740,6 +7740,15 @@ static void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
+static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
+					unsigned i)
+{
+	struct fixed_rsrc_table *table;
+
+	table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
+	return &table->files[i & IORING_FILE_TABLE_MASK];
+}
+
 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 {
 	struct fixed_rsrc_ref_node *ref_node;
@@ -7808,6 +7817,7 @@ static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
 	kfree(ref_node);
 }
 
+
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 				 unsigned nr_args)
 {
@@ -7840,9 +7850,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		goto out_free;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		struct fixed_rsrc_table *table;
-		unsigned index;
-
 		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
 			ret = -EFAULT;
 			goto out_fput;
@@ -7867,9 +7874,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto out_fput;
 		}
-		table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		index = i & IORING_FILE_TABLE_MASK;
-		table->files[index] = file;
+		*io_fixed_file_slot(file_data, i) = file;
 	}
 
 	ret = io_sqe_files_scm(ctx);
@@ -7972,7 +7977,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 {
 	struct fixed_rsrc_data *data = ctx->file_data;
 	struct fixed_rsrc_ref_node *ref_node;
-	struct file *file;
+	struct file *file, **file_slot;
 	__s32 __user *fds;
 	int fd, i, err;
 	__u32 done;
@@ -7990,9 +7995,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 
 	fds = u64_to_user_ptr(up->data);
 	for (done = 0; done < nr_args; done++) {
-		struct fixed_rsrc_table *table;
-		unsigned index;
-
 		err = 0;
 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
 			err = -EFAULT;
@@ -8002,14 +8004,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			continue;
 
 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		index = i & IORING_FILE_TABLE_MASK;
-		if (table->files[index]) {
-			file = table->files[index];
-			err = io_queue_file_removal(data, file);
+		file_slot = io_fixed_file_slot(ctx->file_data, i);
+
+		if (*file_slot) {
+			err = io_queue_file_removal(data, *file_slot);
 			if (err)
 				break;
-			table->files[index] = NULL;
+			*file_slot = NULL;
 			needs_switch = true;
 		}
 		if (fd != -1) {
@@ -8031,13 +8032,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			table->files[index] = file;
 			err = io_sqe_file_register(ctx, file, i);
 			if (err) {
-				table->files[index] = NULL;
 				fput(file);
 				break;
 			}
+			*file_slot = file;
 		}
 	}
 
@@ -9488,11 +9488,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
 	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
 	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
-		struct fixed_rsrc_table *table;
-		struct file *f;
+		struct file *f = *io_fixed_file_slot(ctx->file_data, i);
 
-		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		f = table->files[i & IORING_FILE_TABLE_MASK];
 		if (f)
 			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
 		else
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (11 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 12/13] io_uring: deduplicate file table slot calculation Pavel Begunkov
@ 2021-02-04 13:52 ` Pavel Begunkov
  2021-02-04 14:52   ` Jens Axboe
  2021-02-04 15:07 ` [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Jens Axboe
  13 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Saving one lock/unlock for io-wq is not super important, but adds some
ugliness in the code. More important, atomic decs not turning it to zero
for some archs won't give the right ordering/barriers so the
io_steal_work() may pretty easily get subtly and completely broken.

Return back 2-step io-wq work exchange and clean it up.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io-wq.c    | 16 ++++++----------
 fs/io-wq.h    |  4 ++--
 fs/io_uring.c | 26 ++++----------------------
 3 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2e2f14f42bf2..63ef195b1acb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -555,23 +555,21 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 		/* handle a whole dependent link */
 		do {
-			struct io_wq_work *old_work, *next_hashed, *linked;
+			struct io_wq_work *next_hashed, *linked;
 			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
 			io_impersonate_work(worker, work);
+			wq->do_work(work);
+			io_assign_current_work(worker, NULL);
 
-			old_work = work;
-			linked = wq->do_work(work);
-
+			linked = wq->free_work(work);
 			work = next_hashed;
 			if (!work && linked && !io_wq_is_hashed(linked)) {
 				work = linked;
 				linked = NULL;
 			}
 			io_assign_current_work(worker, work);
-			wq->free_work(old_work);
-
 			if (linked)
 				io_wqe_enqueue(wqe, linked);
 
@@ -850,11 +848,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 	struct io_wq *wq = wqe->wq;
 
 	do {
-		struct io_wq_work *old_work = work;
-
 		work->flags |= IO_WQ_WORK_CANCEL;
-		work = wq->do_work(work);
-		wq->free_work(old_work);
+		wq->do_work(work);
+		work = wq->free_work(work);
 	} while (work);
 }
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index e1ffb80a4a1d..e37a0f217cc8 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -106,8 +106,8 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 	return container_of(work->list.next, struct io_wq_work, list);
 }
 
-typedef void (free_work_fn)(struct io_wq_work *);
-typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
+typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work *);
 
 struct io_wq_data {
 	struct user_struct *user;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5ee6a9273fca..b740a39110d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2379,22 +2379,6 @@ static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
 		io_free_req_deferred(req);
 }
 
-static struct io_wq_work *io_steal_work(struct io_kiocb *req)
-{
-	struct io_kiocb *nxt;
-
-	/*
-	 * A ref is owned by io-wq in which context we're. So, if that's the
-	 * last one, it's safe to steal next work. False negatives are Ok,
-	 * it just will be re-punted async in io_put_work()
-	 */
-	if (refcount_read(&req->refs) != 1)
-		return NULL;
-
-	nxt = io_req_find_next(req);
-	return nxt ? &nxt->work : NULL;
-}
-
 static void io_double_put_req(struct io_kiocb *req)
 {
 	/* drop both submit and complete references */
@@ -6343,7 +6327,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 	return 0;
 }
 
-static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
+static void io_wq_submit_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct io_kiocb *timeout;
@@ -6394,8 +6378,6 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 		if (lock_ctx)
 			mutex_unlock(&lock_ctx->uring_lock);
 	}
-
-	return io_steal_work(req);
 }
 
 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -8067,12 +8049,12 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 	return __io_sqe_files_update(ctx, &up, nr_args);
 }
 
-static void io_free_work(struct io_wq_work *work)
+static struct io_wq_work *io_free_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 
-	/* Consider that io_steal_work() relies on this ref */
-	io_put_req(req);
+	req = io_put_req_find_next(req);
+	return req ? &req->work : NULL;
 }
 
 static int io_init_wq_offload(struct io_ring_ctx *ctx,
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme
  2021-02-04 13:52 ` [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme Pavel Begunkov
@ 2021-02-04 14:52   ` Jens Axboe
  2021-02-04 14:56     ` Pavel Begunkov
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2021-02-04 14:52 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 2/4/21 6:52 AM, Pavel Begunkov wrote:
> Saving one lock/unlock for io-wq is not super important, but adds some
> ugliness in the code. More important, atomic decs not turning it to zero
> for some archs won't give the right ordering/barriers so the
> io_steal_work() may pretty easily get subtly and completely broken.
> 
> Return back 2-step io-wq work exchange and clean it up.

IIRC, this wasn't done to skip the lock/unlock exchange, which I agree
doesn't matter, but to ensure that a link would not need another io-wq
punt. And that is a big deal, it's much faster to run it from that
same thread, rather than needing a new async queue and new thread grab
to get there.

Just want to make sure that's on your mind... Maybe it's still fine
as-is, didn't look too closely yet or test it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme
  2021-02-04 14:52   ` Jens Axboe
@ 2021-02-04 14:56     ` Pavel Begunkov
  2021-02-04 15:05       ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Pavel Begunkov @ 2021-02-04 14:56 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 04/02/2021 14:52, Jens Axboe wrote:
> On 2/4/21 6:52 AM, Pavel Begunkov wrote:
>> Saving one lock/unlock for io-wq is not super important, but adds some
>> ugliness in the code. More important, atomic decs not turning it to zero
>> for some archs won't give the right ordering/barriers so the
>> io_steal_work() may pretty easily get subtly and completely broken.
>>
>> Return back 2-step io-wq work exchange and clean it up.
> 
> IIRC, this wasn't done to skip the lock/unlock exchange, which I agree
> doesn't matter, but to ensure that a link would not need another io-wq
> punt. And that is a big deal, it's much faster to run it from that
> same thread, rather than needing a new async queue and new thread grab
> to get there.

Right, we just refer to different patches and moments. This one is fine
in that regard, it just moves returning link from ->do_work() to
->free_work().

> 
> Just want to make sure that's on your mind... Maybe it's still fine
> as-is, didn't look too closely yet or test it.
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme
  2021-02-04 14:56     ` Pavel Begunkov
@ 2021-02-04 15:05       ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2021-02-04 15:05 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 2/4/21 7:56 AM, Pavel Begunkov wrote:
> On 04/02/2021 14:52, Jens Axboe wrote:
>> On 2/4/21 6:52 AM, Pavel Begunkov wrote:
>>> Saving one lock/unlock for io-wq is not super important, but adds some
>>> ugliness in the code. More important, atomic decs not turning it to zero
>>> for some archs won't give the right ordering/barriers so the
>>> io_steal_work() may pretty easily get subtly and completely broken.
>>>
>>> Return back 2-step io-wq work exchange and clean it up.
>>
>> IIRC, this wasn't done to skip the lock/unlock exchange, which I agree
>> doesn't matter, but to ensure that a link would not need another io-wq
>> punt. And that is a big deal, it's much faster to run it from that
>> same thread, rather than needing a new async queue and new thread grab
>> to get there.
> 
> Right, we just refer to different patches and moments. This one is fine
> in that regard, it just moves returning link from ->do_work() to
> ->free_work().

Got it, looks good then.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups
  2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
                   ` (12 preceding siblings ...)
  2021-02-04 13:52 ` [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme Pavel Begunkov
@ 2021-02-04 15:07 ` Jens Axboe
  13 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2021-02-04 15:07 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 2/4/21 6:51 AM, Pavel Begunkov wrote:
> Bunch of random cleanups, noticeable part of which (4-9/13) refactor
> io_read(), which is currently not in the best shape and is hard to
> understand.
> 
> 7/13 may actually fix a problem.
> 10/13 should make NOWAIT and NONBLOCK to work right
> 
> v2: add 9-13/13
> 
> Pavel Begunkov (13):
>   io_uring: deduplicate core cancellations sequence
>   io_uring: refactor scheduling in io_cqring_wait
>   io_uring: refactor io_cqring_wait
>   io_uring: refactor io_read for unsupported nowait
>   io_uring: further simplify do_read error parsing
>   io_uring: let io_setup_async_rw take care of iovec
>   io_uring: don't forget to adjust io_size
>   io_uring: inline io_read()'s iovec freeing
>   io_uring: highlight read-retry loop
>   io_uring: treat NONBLOCK and RWF_NOWAIT similarly
>   io_uring: io_import_iovec return type cleanup
>   io_uring: deduplicate file table slot calculation
>   io_uring/io-wq: return 2-step work swap scheme
> 
>  fs/io-wq.c    |  16 +--
>  fs/io-wq.h    |   4 +-
>  fs/io_uring.c | 366 ++++++++++++++++++++++----------------------------
>  3 files changed, 166 insertions(+), 220 deletions(-)

Applied, thanks Pavel!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2021-02-04 16:27 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-02-04 13:51 [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Pavel Begunkov
2021-02-04 13:51 ` [PATCH v2 01/13] io_uring: deduplicate core cancellations sequence Pavel Begunkov
2021-02-04 13:51 ` [PATCH v2 02/13] io_uring: refactor scheduling in io_cqring_wait Pavel Begunkov
2021-02-04 13:51 ` [PATCH v2 03/13] io_uring: refactor io_cqring_wait Pavel Begunkov
2021-02-04 13:51 ` [PATCH v2 04/13] io_uring: refactor io_read for unsupported nowait Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 05/13] io_uring: further simplify do_read error parsing Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 06/13] io_uring: let io_setup_async_rw take care of iovec Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 07/13] io_uring: don't forget to adjust io_size Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 08/13] io_uring: inline io_read()'s iovec freeing Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 09/13] io_uring: highlight read-retry loop Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 10/13] io_uring: treat NONBLOCK and RWF_NOWAIT similarly Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 11/13] io_uring: io_import_iovec return type cleanup Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 12/13] io_uring: deduplicate file table slot calculation Pavel Begunkov
2021-02-04 13:52 ` [PATCH v2 13/13] io_uring/io-wq: return 2-step work swap scheme Pavel Begunkov
2021-02-04 14:52   ` Jens Axboe
2021-02-04 14:56     ` Pavel Begunkov
2021-02-04 15:05       ` Jens Axboe
2021-02-04 15:07 ` [PATCH v2 5.12 00/13] a second pack of 5.12 cleanups Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox