* [PATCH v2 00/28] for-next patches
@ 2021-08-09 12:04 Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 01/28] io_uring: use kvmalloc for fixed files Pavel Begunkov
                   ` (28 more replies)
  0 siblings, 29 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

1,2 -- optimisation with kvmalloc()'ing file tables
4 -- optimise prep_rw() still touching inode with !REG fixed files
11 -- a small CQ waiting optimisation
20 -- put_task optimisation, saves atomics in many cases
23 -- helps req alloc sustainability, also needed for future features

All others are cleanups, where 6-28 are resends.

v2: added patches 1-5

Pavel Begunkov (28):
  io_uring: use kvmalloc for fixed files
  io_uring: inline fixed part of io_file_get()
  io_uring: rename io_file_supports_async()
  io_uring: avoid touching inode in rw prep
  io_uring: clean io-wq callbacks
  io_uring: remove unnecessary PF_EXITING check
  io-wq: improve wq_list_add_tail()
  io_uring: refactor io_alloc_req
  io_uring: don't halt iopoll too early
  io_uring: add more locking annotations for submit
  io_uring: optimise io_cqring_wait() hot path
  io_uring: extract a helper for ctx quiesce
  io_uring: move io_put_task() definition
  io_uring: move io_rsrc_node_alloc() definition
  io_uring: inline io_free_req_deferred
  io_uring: deduplicate open iopoll check
  io_uring: improve ctx hang handling
  io_uring: kill unused IO_IOPOLL_BATCH
  io_uring: drop exec checks from io_req_task_submit
  io_uring: optimise putting task struct
  io_uring: hide async data behind flags
  io_uring: move io_fallback_req_func()
  io_uring: cache __io_free_req()'d requests
  io_uring: remove redundant args from cache_free
  io_uring: use inflight_entry instead of compl.list
  io_uring: inline struct io_comp_state
  io_uring: remove extra argument for overflow flush
  io_uring: inline io_poll_remove_waitqs

 fs/io-wq.h    |   2 +-
 fs/io_uring.c | 700 +++++++++++++++++++++++++-------------------------
 2 files changed, 355 insertions(+), 347 deletions(-)

-- 
2.32.0



* [PATCH 01/28] io_uring: use kvmalloc for fixed files
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 02/28] io_uring: inline fixed part of io_file_get() Pavel Begunkov
                   ` (27 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Instead of hand-coded two-level tables for registered files, allocate
them with kvmalloc(). In many cases the table is small enough to be
kmalloc()'ed, removing an extra memory load and a bunch of bit-logic
instructions from the hot path. If the table is larger, we trade those
wins for a TLB-assisted memory lookup.
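
For illustration, the lookup before and after (a sketch distilled from
the diff below, not additional code):

	/* before: two-level table, two dependent loads plus bit ops */
	struct io_fixed_file *l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
	file_slot = &l2[i & IORING_FILE_TABLE_MASK];

	/* after: one flat kvmalloc()'ed array, a single indexed load */
	file_slot = &table->files[i];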

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 33 ++++++++++-----------------------
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ebf467e0cb0f..5072f84ef99f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -91,13 +91,8 @@
 #define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
 
-/*
- * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
- */
-#define IORING_FILE_TABLE_SHIFT	9
-#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
-#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
-#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
+/* 512 entries per page on 64-bit archs, 64 pages max */
+#define IORING_MAX_FIXED_FILES	(1U << 15)
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
 
@@ -234,8 +229,7 @@ struct io_rsrc_put {
 };
 
 struct io_file_table {
-	/* two level table */
-	struct io_fixed_file **files;
+	struct io_fixed_file *files;
 };
 
 struct io_rsrc_node {
@@ -6334,12 +6328,9 @@ static void io_wq_submit_work(struct io_wq_work *work)
 #define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
 
 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
-						      unsigned i)
+						       unsigned i)
 {
-	struct io_fixed_file *table_l2;
-
-	table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
-	return &table_l2[i & IORING_FILE_TABLE_MASK];
+	return &table->files[i];
 }
 
 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -7283,17 +7274,13 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
 
 static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 {
-	size_t size = nr_files * sizeof(struct io_fixed_file);
-
-	table->files = (struct io_fixed_file **)io_alloc_page_table(size);
+	table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL);
 	return !!table->files;
 }
 
-static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+static void io_free_file_tables(struct io_file_table *table)
 {
-	size_t size = nr_files * sizeof(struct io_fixed_file);
-
-	io_free_page_table((void **)table->files, size);
+	kvfree(table->files);
 	table->files = NULL;
 }
 
@@ -7318,7 +7305,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 			fput(file);
 	}
 #endif
-	io_free_file_tables(&ctx->file_table, ctx->nr_user_files);
+	io_free_file_tables(&ctx->file_table);
 	io_rsrc_data_free(ctx->file_data);
 	ctx->file_data = NULL;
 	ctx->nr_user_files = 0;
@@ -7785,7 +7772,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		if (file)
 			fput(file);
 	}
-	io_free_file_tables(&ctx->file_table, nr_args);
+	io_free_file_tables(&ctx->file_table);
 	ctx->nr_user_files = 0;
 out_free:
 	io_rsrc_data_free(ctx->file_data);
-- 
2.32.0



* [PATCH 02/28] io_uring: inline fixed part of io_file_get()
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 01/28] io_uring: use kvmalloc for fixed files Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 03/28] io_uring: rename io_file_supports_async() Pavel Begunkov
                   ` (26 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Optimise io_file_get() for registered files, which is in a hot path,
by inlining parts of the function. This saves a function call and the
inefficiencies of passing arguments, e.g. evaluating
(sqe_flags & IOSQE_FIXED_FILE).

It couldn't be done before because compilers refused to inline the
function due to its size.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 65 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 26 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5072f84ef99f..900c1a4d6a0a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1058,7 +1058,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 				     struct io_uring_rsrc_update2 *up,
 				     unsigned nr_args);
 static void io_clean_op(struct io_kiocb *req);
-static struct file *io_file_get(struct io_submit_state *state,
+static struct file *io_file_get(struct io_ring_ctx *ctx,
+				struct io_submit_state *state,
 				struct io_kiocb *req, int fd, bool fixed);
 static void __io_queue_sqe(struct io_kiocb *req);
 static void io_rsrc_put_work(struct work_struct *work);
@@ -3622,7 +3623,8 @@ static int __io_splice_prep(struct io_kiocb *req,
 	if (unlikely(sp->flags & ~valid_flags))
 		return -EINVAL;
 
-	sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
+	sp->file_in = io_file_get(req->ctx, NULL, req,
+				  READ_ONCE(sqe->splice_fd_in),
 				  (sp->flags & SPLICE_F_FD_IN_FIXED));
 	if (!sp->file_in)
 		return -EBADF;
@@ -6354,36 +6356,48 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
 	file_slot->file_ptr = file_ptr;
 }
 
-static struct file *io_file_get(struct io_submit_state *state,
-				struct io_kiocb *req, int fd, bool fixed)
+static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
+					     struct io_kiocb *req, int fd)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	struct file *file;
+	unsigned long file_ptr;
 
-	if (fixed) {
-		unsigned long file_ptr;
+	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
+		return NULL;
+	fd = array_index_nospec(fd, ctx->nr_user_files);
+	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+	file = (struct file *) (file_ptr & FFS_MASK);
+	file_ptr &= ~FFS_MASK;
+	/* mask in overlapping REQ_F and FFS bits */
+	req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
+	io_req_set_rsrc_node(req);
+	return file;
+}
 
-		if (unlikely((unsigned int)fd >= ctx->nr_user_files))
-			return NULL;
-		fd = array_index_nospec(fd, ctx->nr_user_files);
-		file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
-		file = (struct file *) (file_ptr & FFS_MASK);
-		file_ptr &= ~FFS_MASK;
-		/* mask in overlapping REQ_F and FFS bits */
-		req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
-		io_req_set_rsrc_node(req);
-	} else {
-		trace_io_uring_file_get(ctx, fd);
-		file = __io_file_get(state, fd);
+static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
+				       struct io_submit_state *state,
+				       struct io_kiocb *req, int fd)
+{
+	struct file *file = __io_file_get(state, fd);
 
-		/* we don't allow fixed io_uring files */
-		if (file && unlikely(file->f_op == &io_uring_fops))
-			io_req_track_inflight(req);
-	}
+	trace_io_uring_file_get(ctx, fd);
 
+	/* we don't allow fixed io_uring files */
+	if (file && unlikely(file->f_op == &io_uring_fops))
+		io_req_track_inflight(req);
 	return file;
 }
 
+static inline struct file *io_file_get(struct io_ring_ctx *ctx,
+				       struct io_submit_state *state,
+				       struct io_kiocb *req, int fd, bool fixed)
+{
+	if (fixed)
+		return io_file_get_fixed(ctx, req, fd);
+	else
+		return io_file_get_normal(ctx, state, req, fd);
+}
+
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 {
 	struct io_timeout_data *data = container_of(timer,
@@ -6590,9 +6604,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	}
 
 	if (io_op_defs[req->opcode].needs_file) {
-		bool fixed = req->flags & REQ_F_FIXED_FILE;
-
-		req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+		req->file = io_file_get(ctx, state, req, READ_ONCE(sqe->fd),
+					(sqe_flags & IOSQE_FIXED_FILE));
 		if (unlikely(!req->file))
 			ret = -EBADF;
 	}
-- 
2.32.0



* [PATCH 03/28] io_uring: rename io_file_supports_async()
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 01/28] io_uring: use kvmalloc for fixed files Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 02/28] io_uring: inline fixed part of io_file_get() Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 04/28] io_uring: avoid touching inode in rw prep Pavel Begunkov
                   ` (25 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

io_file_supports_async() checks whether a file supports nowait
operations, so "async" in the name is misleading. Rename it.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 900c1a4d6a0a..d34bba222039 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -718,8 +718,8 @@ enum {
 	REQ_F_DONT_REISSUE_BIT,
 	REQ_F_CREDS_BIT,
 	/* keep async read/write and isreg together and in order */
-	REQ_F_ASYNC_READ_BIT,
-	REQ_F_ASYNC_WRITE_BIT,
+	REQ_F_NOWAIT_READ_BIT,
+	REQ_F_NOWAIT_WRITE_BIT,
 	REQ_F_ISREG_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
@@ -765,9 +765,9 @@ enum {
 	/* don't attempt request reissue, see io_rw_reissue() */
 	REQ_F_DONT_REISSUE	= BIT(REQ_F_DONT_REISSUE_BIT),
 	/* supports async reads */
-	REQ_F_ASYNC_READ	= BIT(REQ_F_ASYNC_READ_BIT),
+	REQ_F_NOWAIT_READ	= BIT(REQ_F_NOWAIT_READ_BIT),
 	/* supports async writes */
-	REQ_F_ASYNC_WRITE	= BIT(REQ_F_ASYNC_WRITE_BIT),
+	REQ_F_NOWAIT_WRITE	= BIT(REQ_F_NOWAIT_WRITE_BIT),
 	/* regular file */
 	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
 	/* has creds assigned */
@@ -2628,7 +2628,7 @@ static bool io_bdev_nowait(struct block_device *bdev)
  * any file. For now, just ensure that anything potentially problematic is done
  * inline.
  */
-static bool __io_file_supports_async(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, int rw)
 {
 	umode_t mode = file_inode(file)->i_mode;
 
@@ -2661,14 +2661,14 @@ static bool __io_file_supports_async(struct file *file, int rw)
 	return file->f_op->write_iter != NULL;
 }
 
-static bool io_file_supports_async(struct io_kiocb *req, int rw)
+static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
 {
-	if (rw == READ && (req->flags & REQ_F_ASYNC_READ))
+	if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
 		return true;
-	else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE))
+	else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
 		return true;
 
-	return __io_file_supports_async(req->file, rw);
+	return __io_file_supports_nowait(req->file, rw);
 }
 
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -3292,7 +3292,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 		kiocb->ki_flags |= IOCB_NOWAIT;
 
 	/* If the file doesn't support async, just async punt */
-	if (force_nonblock && !io_file_supports_async(req, READ)) {
+	if (force_nonblock && !io_file_supports_nowait(req, READ)) {
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
 		return ret ?: -EAGAIN;
 	}
@@ -3399,7 +3399,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		kiocb->ki_flags |= IOCB_NOWAIT;
 
 	/* If the file doesn't support async, just async punt */
-	if (force_nonblock && !io_file_supports_async(req, WRITE))
+	if (force_nonblock && !io_file_supports_nowait(req, WRITE))
 		goto copy_iov;
 
 	/* file path doesn't support NOWAIT for non-direct_IO */
@@ -5209,7 +5209,7 @@ static int io_arm_poll_handler(struct io_kiocb *req)
 	}
 
 	/* if we can't nonblock try, then no point in arming a poll handler */
-	if (!io_file_supports_async(req, rw))
+	if (!io_file_supports_nowait(req, rw))
 		return IO_APOLL_ABORTED;
 
 	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
@@ -6347,9 +6347,9 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
 {
 	unsigned long file_ptr = (unsigned long) file;
 
-	if (__io_file_supports_async(file, READ))
+	if (__io_file_supports_nowait(file, READ))
 		file_ptr |= FFS_ASYNC_READ;
-	if (__io_file_supports_async(file, WRITE))
+	if (__io_file_supports_nowait(file, WRITE))
 		file_ptr |= FFS_ASYNC_WRITE;
 	if (S_ISREG(file_inode(file)->i_mode))
 		file_ptr |= FFS_ISREG;
@@ -6369,7 +6369,7 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
 	file = (struct file *) (file_ptr & FFS_MASK);
 	file_ptr &= ~FFS_MASK;
 	/* mask in overlapping REQ_F and FFS bits */
-	req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
+	req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
 	io_req_set_rsrc_node(req);
 	return file;
 }
-- 
2.32.0



* [PATCH 04/28] io_uring: avoid touching inode in rw prep
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (2 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 03/28] io_uring: rename io_file_supports_async() Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 05/28] io_uring: clean io-wq callbacks Pavel Begunkov
                   ` (24 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

If we use fixed files, we can be (almost) sure that REQ_F_ISREG is
already set as needed. However, for non-regular files io_prep_rw()
will still look into the inode to double check, and that's expensive
and can be avoided.

The only caveat is that this currently works only on 64-bit
architectures, see FFS_ISREG, so we have to take that into account.
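
As a sketch of the underlying trick (distilled from the FFS_* defines
this patch moves; not additional code): struct file pointers have
spare low bits for flags, and FFS_ISREG takes a third bit that is only
trusted to be free under CONFIG_64BIT:

	unsigned long file_ptr = (unsigned long) file;

	file_ptr |= FFS_ASYNC_READ | FFS_ISREG;		/* stash flags */
	file = (struct file *) (file_ptr & FFS_MASK);	/* recover pointer */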

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d34bba222039..42cf69c6d9b6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1230,6 +1230,20 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
 	return false;
 }
 
+#define FFS_ASYNC_READ		0x1UL
+#define FFS_ASYNC_WRITE		0x2UL
+#ifdef CONFIG_64BIT
+#define FFS_ISREG		0x4UL
+#else
+#define FFS_ISREG		0x0UL
+#endif
+#define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+
+static inline bool io_req_ffs_set(struct io_kiocb *req)
+{
+	return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
+}
+
 static void io_req_track_inflight(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_INFLIGHT)) {
@@ -2679,7 +2693,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	unsigned ioprio;
 	int ret;
 
-	if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode))
+	if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
 		req->flags |= REQ_F_ISREG;
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
@@ -6320,15 +6334,6 @@ static void io_wq_submit_work(struct io_wq_work *work)
 	}
 }
 
-#define FFS_ASYNC_READ		0x1UL
-#define FFS_ASYNC_WRITE		0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG		0x4UL
-#else
-#define FFS_ISREG		0x0UL
-#endif
-#define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
-
 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
 						       unsigned i)
 {
-- 
2.32.0



* [PATCH 05/28] io_uring: clean io-wq callbacks
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (3 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 04/28] io_uring: avoid touching inode in rw prep Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 06/28] io_uring: remove unnecessary PF_EXITING check Pavel Begunkov
                   ` (23 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Move io-wq callbacks closer to each other, so it's easier to work with
them, and rename io_free_work() to io_wq_free_work() for consistency.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 42cf69c6d9b6..8f18af509afd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6299,6 +6299,14 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	return 0;
 }
 
+static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+	req = io_put_req_find_next(req);
+	return req ? &req->work : NULL;
+}
+
 static void io_wq_submit_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
@@ -7936,14 +7944,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 	return done ? done : err;
 }
 
-static struct io_wq_work *io_free_work(struct io_wq_work *work)
-{
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
-	req = io_put_req_find_next(req);
-	return req ? &req->work : NULL;
-}
-
 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
 					struct task_struct *task)
 {
@@ -7967,7 +7967,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
 
 	data.hash = hash;
 	data.task = task;
-	data.free_work = io_free_work;
+	data.free_work = io_wq_free_work;
 	data.do_work = io_wq_submit_work;
 
 	/* Do QD, or 4 * CPUS, whatever is smallest */
-- 
2.32.0



* [PATCH 06/28] io_uring: remove unnecessary PF_EXITING check
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (4 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 05/28] io_uring: clean io-wq callbacks Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 07/28] io-wq: improve wq_list_add_tail() Pavel Begunkov
                   ` (22 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We prefer normal task_works even if they would fail requests inside.
Kill the PF_EXITING check in io_req_task_work_add(); task_work_add()
handles dying tasks well, i.e. it returns an error when it can't
enqueue during the late stages of do_exit().
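
The behaviour relied upon, as a simplified sketch of the
task_work_add() core from kernel/task_work.c (quoted from memory, so
details such as the exact errno may differ):

	do {
		head = READ_ONCE(task->task_works);
		/* exit_task_work() leaves this sentinel behind */
		if (unlikely(head == &work_exited))
			return -EBUSY;
		work->next = head;
	} while (cmpxchg(&task->task_works, head, work) != head);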

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8f18af509afd..ba1df6ae6024 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2003,8 +2003,6 @@ static void io_req_task_work_add(struct io_kiocb *req)
 	if (test_bit(0, &tctx->task_state) ||
 	    test_and_set_bit(0, &tctx->task_state))
 		return;
-	if (unlikely(tsk->flags & PF_EXITING))
-		goto fail;
 
 	/*
 	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
@@ -2017,7 +2015,7 @@ static void io_req_task_work_add(struct io_kiocb *req)
 		wake_up_process(tsk);
 		return;
 	}
-fail:
+
 	clear_bit(0, &tctx->task_state);
 	spin_lock_irqsave(&tctx->task_lock, flags);
 	node = tctx->task_list.first;
-- 
2.32.0



* [PATCH 07/28] io-wq: improve wq_list_add_tail()
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (5 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 06/28] io_uring: remove unnecessary PF_EXITING check Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 08/28] io_uring: refactor io_alloc_req Pavel Begunkov
                   ` (21 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Prepare nodes that we're going to add before actually linking them;
it's always safer and costs us nothing.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io-wq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3999ee58ff26..308af3928424 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -44,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
 static inline void wq_list_add_tail(struct io_wq_work_node *node,
 				    struct io_wq_work_list *list)
 {
+	node->next = NULL;
 	if (!list->first) {
 		list->last = node;
 		WRITE_ONCE(list->first, node);
@@ -51,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
 		list->last->next = node;
 		list->last = node;
 	}
-	node->next = NULL;
 }
 
 static inline void wq_list_cut(struct io_wq_work_list *list,
-- 
2.32.0



* [PATCH 08/28] io_uring: refactor io_alloc_req
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (6 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 07/28] io-wq: improve wq_list_add_tail() Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 09/28] io_uring: don't halt iopoll too early Pavel Begunkov
                   ` (20 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Replace the main if in io_alloc_req() (around io_flush_cached_reqs())
with an inverted condition + goto, so all the cases are handled in the
same way. Also extract io_preinit_req() to make it cleaner and easier
to refer to.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 66 +++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ba1df6ae6024..80d7f79db911 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1699,6 +1699,19 @@ static void io_req_complete_failed(struct io_kiocb *req, long res)
 	io_req_complete_post(req, res, 0);
 }
 
+/*
+ * Don't initialise the fields below on every allocation, but do that in
+ * advance and keep them valid across allocations.
+ */
+static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+	req->ctx = ctx;
+	req->link = NULL;
+	req->async_data = NULL;
+	/* not necessary, but safer to zero */
+	req->result = 0;
+}
+
 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 					struct io_comp_state *cs)
 {
@@ -1741,45 +1754,31 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *state = &ctx->submit_state;
+	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+	int ret, i;
 
 	BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
 
-	if (!state->free_reqs) {
-		gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
-		int ret, i;
-
-		if (io_flush_cached_reqs(ctx))
-			goto got_req;
-
-		ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
-					    state->reqs);
-
-		/*
-		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
-		 * retry single alloc to be on the safe side.
-		 */
-		if (unlikely(ret <= 0)) {
-			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
-			if (!state->reqs[0])
-				return NULL;
-			ret = 1;
-		}
+	if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
+		goto got_req;
 
-		/*
-		 * Don't initialise the fields below on every allocation, but
-		 * do that in advance and keep valid on free.
-		 */
-		for (i = 0; i < ret; i++) {
-			struct io_kiocb *req = state->reqs[i];
+	ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
+				    state->reqs);
 
-			req->ctx = ctx;
-			req->link = NULL;
-			req->async_data = NULL;
-			/* not necessary, but safer to zero */
-			req->result = 0;
-		}
-		state->free_reqs = ret;
+	/*
+	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
+	 * retry single alloc to be on the safe side.
+	 */
+	if (unlikely(ret <= 0)) {
+		state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+		if (!state->reqs[0])
+			return NULL;
+		ret = 1;
 	}
+
+	for (i = 0; i < ret; i++)
+		io_preinit_req(state->reqs[i], ctx);
+	state->free_reqs = ret;
 got_req:
 	state->free_reqs--;
 	return state->reqs[state->free_reqs];
@@ -6570,6 +6569,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	unsigned int sqe_flags;
 	int personality, ret = 0;
 
+	/* req is partially pre-initialised, see io_preinit_req() */
 	req->opcode = READ_ONCE(sqe->opcode);
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
 	req->flags = sqe_flags = READ_ONCE(sqe->flags);
-- 
2.32.0



* [PATCH 09/28] io_uring: don't halt iopoll too early
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (7 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 08/28] io_uring: refactor io_alloc_req Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 10/28] io_uring: add more locking annotations for submit Pavel Begunkov
                   ` (19 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

IOPOLL users care about getting completions for the requests they
submitted, not about the device having done/completed something in
general. Currently, io_do_iopoll() may return a positive number, which
instructs io_iopoll_check() to break the loop and end the syscall,
even if there are not enough CQEs, or none at all.

Don't return positive numbers, so io_iopoll_check() exits only when it
gets an actual error, needs to reschedule, or has got enough CQEs.
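
A simplified sketch of the caller's loop (assuming io_iopoll_check()
looks roughly like this; not part of the patch), showing why a
positive return used to end the syscall early:

	do {
		ret = io_do_iopoll(ctx, &nr_events, min, true);
		if (ret < 0)
			break;		/* now: real errors only */
	} while (nr_events < min && !need_resched());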

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 80d7f79db911..911a223a90e1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2285,7 +2285,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	struct io_kiocb *req, *tmp;
 	LIST_HEAD(done);
 	bool spin;
-	int ret;
 
 	/*
 	 * Only spin for completions if we don't have multiple devices hanging
@@ -2293,9 +2292,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	 */
 	spin = !ctx->poll_multi_queue && *nr_events < min;
 
-	ret = 0;
 	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
 		struct kiocb *kiocb = &req->rw.kiocb;
+		int ret;
 
 		/*
 		 * Move completed and retryable entries to our local lists.
@@ -2310,22 +2309,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			break;
 
 		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
-		if (ret < 0)
-			break;
+		if (unlikely(ret < 0))
+			return ret;
+		else if (ret)
+			spin = false;
 
 		/* iopoll may have completed current req */
 		if (READ_ONCE(req->iopoll_completed))
 			list_move_tail(&req->inflight_entry, &done);
-
-		if (ret && spin)
-			spin = false;
-		ret = 0;
 	}
 
 	if (!list_empty(&done))
 		io_iopoll_complete(ctx, nr_events, &done, resubmit);
 
-	return ret;
+	return 0;
 }
 
 /*
-- 
2.32.0



* [PATCH 10/28] io_uring: add more locking annotations for submit
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (8 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 09/28] io_uring: don't halt iopoll too early Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 11/28] io_uring: optimise io_cqring_wait() hot path Pavel Begunkov
                   ` (18 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Add more annotations for submission path functions holding ->uring_lock.
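
For reference, __must_hold() is only meaningful to sparse; a sketch of
its definition, quoted from memory of include/linux/compiler_types.h:

	#ifdef __CHECKER__
	# define __must_hold(x)	__attribute__((context(x,1,1)))
	#else
	# define __must_hold(x)
	#endif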

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 911a223a90e1..0f49736cd2b4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2130,6 +2130,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
 }
 
 static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+	__must_hold(&req->ctx->uring_lock)
 {
 	struct io_comp_state *cs = &ctx->submit_state.comp;
 	int i, nr = cs->nr;
@@ -6474,6 +6475,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 }
 
 static void __io_queue_sqe(struct io_kiocb *req)
+	__must_hold(&req->ctx->uring_lock)
 {
 	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
 	int ret;
@@ -6517,6 +6519,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
 }
 
 static inline void io_queue_sqe(struct io_kiocb *req)
+	__must_hold(&req->ctx->uring_lock)
 {
 	if (unlikely(req->ctx->drain_active) && io_drain_req(req))
 		return;
@@ -6561,6 +6564,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
 
 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		       const struct io_uring_sqe *sqe)
+	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_state *state;
 	unsigned int sqe_flags;
@@ -6624,6 +6628,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			 const struct io_uring_sqe *sqe)
+	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_link *link = &ctx->submit_state.link;
 	int ret;
@@ -6756,6 +6761,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 }
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
+	__must_hold(&ctx->uring_lock)
 {
 	struct io_uring_task *tctx;
 	int submitted = 0;
-- 
2.32.0



* [PATCH 11/28] io_uring: optimise io_cqring_wait() hot path
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (9 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 10/28] io_uring: add more locking annotations for submit Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 12/28] io_uring: extract a helper for ctx quiesce Pavel Begunkov
                   ` (17 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Turns out we always init struct io_wait_queue in io_cqring_wait(),
even if it's not used afterwards, i.e. when there are already enough
CQEs. And often that's exactly what happens, for instance when
requests have been completed inline, or in the io_uring_enter(submit=N,
wait=1) case.

It shows up in my profiler, so optimise it by delaying the struct init.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0f49736cd2b4..0fd04d25c520 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7062,15 +7062,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz,
 			  struct __kernel_timespec __user *uts)
 {
-	struct io_wait_queue iowq = {
-		.wq = {
-			.private	= current,
-			.func		= io_wake_function,
-			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
-		},
-		.ctx		= ctx,
-		.to_wait	= min_events,
-	};
+	struct io_wait_queue iowq;
 	struct io_rings *rings = ctx->rings;
 	signed long timeout = MAX_SCHEDULE_TIMEOUT;
 	int ret;
@@ -7104,7 +7096,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		timeout = timespec64_to_jiffies(&ts);
 	}
 
+	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
+	iowq.wq.private = current;
+	INIT_LIST_HEAD(&iowq.wq.entry);
+	iowq.ctx = ctx;
+	iowq.to_wait = min_events;
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
 		/* if we can't even flush overflow, don't wait for more */
-- 
2.32.0



* [PATCH 12/28] io_uring: extract a helper for ctx quiesce
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (10 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 11/28] io_uring: optimise io_cqring_wait() hot path Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 13/28] io_uring: move io_put_task() definition Pavel Begunkov
                   ` (16 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Refactor __io_uring_register() by extracting a helper responsible for
ctx quiesce. Looks better and will make it easier to add more
optimisations.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 53 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0fd04d25c520..292dbf10e316 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -10077,6 +10077,33 @@ static bool io_register_op_must_quiesce(int op)
 	}
 }
 
+static int io_ctx_quiesce(struct io_ring_ctx *ctx)
+{
+	long ret;
+
+	percpu_ref_kill(&ctx->refs);
+
+	/*
+	 * Drop uring mutex before waiting for references to exit. If another
+	 * thread is currently inside io_uring_enter() it might need to grab the
+	 * uring_lock to make progress. If we hold it here across the drain
+	 * wait, then we can deadlock. It's safe to drop the mutex here, since
+	 * no new references will come in after we've killed the percpu ref.
+	 */
+	mutex_unlock(&ctx->uring_lock);
+	do {
+		ret = wait_for_completion_interruptible(&ctx->ref_comp);
+		if (!ret)
+			break;
+		ret = io_run_task_work_sig();
+	} while (ret >= 0);
+	mutex_lock(&ctx->uring_lock);
+
+	if (ret)
+		io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -10101,31 +10128,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	}
 
 	if (io_register_op_must_quiesce(opcode)) {
-		percpu_ref_kill(&ctx->refs);
-
-		/*
-		 * Drop uring mutex before waiting for references to exit. If
-		 * another thread is currently inside io_uring_enter() it might
-		 * need to grab the uring_lock to make progress. If we hold it
-		 * here across the drain wait, then we can deadlock. It's safe
-		 * to drop the mutex here, since no new references will come in
-		 * after we've killed the percpu ref.
-		 */
-		mutex_unlock(&ctx->uring_lock);
-		do {
-			ret = wait_for_completion_interruptible(&ctx->ref_comp);
-			if (!ret)
-				break;
-			ret = io_run_task_work_sig();
-			if (ret < 0)
-				break;
-		} while (1);
-		mutex_lock(&ctx->uring_lock);
-
-		if (ret) {
-			io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+		ret = io_ctx_quiesce(ctx);
+		if (ret)
 			return ret;
-		}
 	}
 
 	switch (opcode) {
-- 
2.32.0



* [PATCH 13/28] io_uring: move io_put_task() definition
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (11 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 12/28] io_uring: extract a helper for ctx quiesce Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 14/28] io_uring: move io_rsrc_node_alloc() definition Pavel Begunkov
                   ` (15 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Move the function up in the source file, as it is, to get rid of the
forward declaration.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 292dbf10e316..fc778724acd5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1051,7 +1051,6 @@ static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req, int nr);
 static void io_dismantle_req(struct io_kiocb *req);
-static void io_put_task(struct task_struct *task, int nr);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
@@ -1570,6 +1569,17 @@ static inline void req_ref_get(struct io_kiocb *req)
 	atomic_inc(&req->refs);
 }
 
+/* must to be called somewhat shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+	struct io_uring_task *tctx = task->io_uring;
+
+	percpu_counter_sub(&tctx->inflight, nr);
+	if (unlikely(atomic_read(&tctx->in_idle)))
+		wake_up(&tctx->wait);
+	put_task_struct_many(task, nr);
+}
+
 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 				     long res, unsigned int cflags)
 {
@@ -1806,17 +1816,6 @@ static void io_dismantle_req(struct io_kiocb *req)
 	}
 }
 
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
-{
-	struct io_uring_task *tctx = task->io_uring;
-
-	percpu_counter_sub(&tctx->inflight, nr);
-	if (unlikely(atomic_read(&tctx->in_idle)))
-		wake_up(&tctx->wait);
-	put_task_struct_many(task, nr);
-}
-
 static void __io_free_req(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-- 
2.32.0



* [PATCH 14/28] io_uring: move io_rsrc_node_alloc() definition
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (12 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 13/28] io_uring: move io_put_task() definition Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 15/28] io_uring: inline io_free_req_deferred Pavel Begunkov
                   ` (14 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Move the function, together with io_rsrc_node_ref_zero(), up in the
source file to get rid of forward declarations.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 87 +++++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fc778724acd5..1237e6e87ff2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1044,7 +1044,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct task_struct *task,
 					 bool cancel_all);
 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
 
 static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 				 long res, unsigned int cflags);
@@ -7169,6 +7168,49 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
 	kfree(ref_node);
 }
 
+static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+{
+	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
+	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
+	bool first_add = false;
+
+	io_rsrc_ref_lock(ctx);
+	node->done = true;
+
+	while (!list_empty(&ctx->rsrc_ref_list)) {
+		node = list_first_entry(&ctx->rsrc_ref_list,
+					    struct io_rsrc_node, node);
+		/* recycle ref nodes in order */
+		if (!node->done)
+			break;
+		list_del(&node->node);
+		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
+	}
+	io_rsrc_ref_unlock(ctx);
+
+	if (first_add)
+		mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
+}
+
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
+{
+	struct io_rsrc_node *ref_node;
+
+	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+	if (!ref_node)
+		return NULL;
+
+	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
+			    0, GFP_KERNEL)) {
+		kfree(ref_node);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&ref_node->node);
+	INIT_LIST_HEAD(&ref_node->rsrc_list);
+	ref_node->done = false;
+	return ref_node;
+}
+
 static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 				struct io_rsrc_data *data_to_kill)
 {
@@ -7681,49 +7723,6 @@ static void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
-{
-	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
-	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
-	bool first_add = false;
-
-	io_rsrc_ref_lock(ctx);
-	node->done = true;
-
-	while (!list_empty(&ctx->rsrc_ref_list)) {
-		node = list_first_entry(&ctx->rsrc_ref_list,
-					    struct io_rsrc_node, node);
-		/* recycle ref nodes in order */
-		if (!node->done)
-			break;
-		list_del(&node->node);
-		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
-	}
-	io_rsrc_ref_unlock(ctx);
-
-	if (first_add)
-		mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
-}
-
-static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
-{
-	struct io_rsrc_node *ref_node;
-
-	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
-	if (!ref_node)
-		return NULL;
-
-	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
-			    0, GFP_KERNEL)) {
-		kfree(ref_node);
-		return NULL;
-	}
-	INIT_LIST_HEAD(&ref_node->node);
-	INIT_LIST_HEAD(&ref_node->rsrc_list);
-	ref_node->done = false;
-	return ref_node;
-}
-
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 				 unsigned nr_args, u64 __user *tags)
 {
-- 
2.32.0



* [PATCH 15/28] io_uring: inline io_free_req_deferred
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (13 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 14/28] io_uring: move io_rsrc_node_alloc() definition Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 16/28] io_uring: deduplicate open iopoll check Pavel Begunkov
                   ` (13 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Inline io_free_req_deferred(); there is no reason to keep it separate.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1237e6e87ff2..17ead2a7e899 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2179,16 +2179,12 @@ static inline void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static void io_free_req_deferred(struct io_kiocb *req)
-{
-	req->io_task_work.func = io_free_req;
-	io_req_task_work_add(req);
-}
-
 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
 {
-	if (req_ref_sub_and_test(req, refs))
-		io_free_req_deferred(req);
+	if (req_ref_sub_and_test(req, refs)) {
+		req->io_task_work.func = io_free_req;
+		io_req_task_work_add(req);
+	}
 }
 
 static unsigned io_cqring_events(struct io_ring_ctx *ctx)
-- 
2.32.0



* [PATCH 16/28] io_uring: deduplicate open iopoll check
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (14 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 15/28] io_uring: inline io_free_req_deferred Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 17/28] io_uring: improve ctx hang handling Pavel Begunkov
                   ` (12 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Move IORING_SETUP_IOPOLL check into __io_openat_prep(), so both openat
and openat2 reuse it.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 17ead2a7e899..cbd39ac2e92b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3790,6 +3790,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	const char __user *fname;
 	int ret;
 
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
 	if (unlikely(sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 	if (unlikely(req->flags & REQ_F_FIXED_FILE))
@@ -3814,12 +3816,9 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 
 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	u64 flags, mode;
+	u64 mode = READ_ONCE(sqe->len);
+	u64 flags = READ_ONCE(sqe->open_flags);
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	mode = READ_ONCE(sqe->len);
-	flags = READ_ONCE(sqe->open_flags);
 	req->open.how = build_open_how(flags, mode);
 	return __io_openat_prep(req, sqe);
 }
@@ -3830,8 +3829,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	size_t len;
 	int ret;
 
-	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
 	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 	len = READ_ONCE(sqe->len);
 	if (len < OPEN_HOW_SIZE_VER0)
-- 
2.32.0



* [PATCH 17/28] io_uring: improve ctx hang handling
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (15 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 16/28] io_uring: deduplicate open iopoll check Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 18/28] io_uring: kill unused IO_IOPOLL_BATCH Pavel Begunkov
                   ` (11 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

If io_ring_exit_work() can't get it done in 5 minutes, something is
going very wrong. Don't keep spinning at the HZ / 20 rate, it doesn't
help, and it may take a lot of CPU time if there are many workers
stuck like that.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cbd39ac2e92b..a6fe8332d3fb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8800,6 +8800,7 @@ static void io_ring_exit_work(struct work_struct *work)
 {
 	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
 	unsigned long timeout = jiffies + HZ * 60 * 5;
+	unsigned long interval = HZ / 20;
 	struct io_tctx_exit exit;
 	struct io_tctx_node *node;
 	int ret;
@@ -8824,8 +8825,11 @@ static void io_ring_exit_work(struct work_struct *work)
 			io_sq_thread_unpark(sqd);
 		}
 
-		WARN_ON_ONCE(time_after(jiffies, timeout));
-	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
+		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
+			/* there is little hope left, don't run it too often */
+			interval = HZ * 60;
+		}
+	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
 
 	init_completion(&exit.completion);
 	init_task_work(&exit.task_work, io_tctx_exit_cb);
-- 
2.32.0



* [PATCH 18/28] io_uring: kill unused IO_IOPOLL_BATCH
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (16 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 17/28] io_uring: improve ctx hang handling Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 19/28] io_uring: drop exec checks from io_req_task_submit Pavel Begunkov
                   ` (10 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

IO_IOPOLL_BATCH is not used, delete it.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a6fe8332d3fb..ba0c61d42802 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -294,7 +294,6 @@ struct io_sq_data {
 	struct completion	exited;
 };
 
-#define IO_IOPOLL_BATCH			8
 #define IO_COMPL_BATCH			32
 #define IO_REQ_CACHE_SIZE		32
 #define IO_REQ_ALLOC_BATCH		8
-- 
2.32.0



* [PATCH 19/28] io_uring: drop exec checks from io_req_task_submit
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (17 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 18/28] io_uring: kill unused IO_IOPOLL_BATCH Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 20/28] io_uring: optimise putting task struct Pavel Begunkov
                   ` (9 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

In case of on-exec io_uring cancellations, tasks already wait for all
submitted requests to get completed/cancelled, so we don't need to check
for ->in_execve separately.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ba0c61d42802..3c5c4cf73d1c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2043,7 +2043,7 @@ static void io_req_task_submit(struct io_kiocb *req)
 
 	/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
 	mutex_lock(&ctx->uring_lock);
-	if (!(req->task->flags & PF_EXITING) && !req->task->in_execve)
+	if (likely(!(req->task->flags & PF_EXITING)))
 		__io_queue_sqe(req);
 	else
 		io_req_complete_failed(req, -EFAULT);
-- 
2.32.0



* [PATCH 20/28] io_uring: optimise putting task struct
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (18 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 19/28] io_uring: drop exec checks from io_req_task_submit Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 21/28] io_uring: hide async data behind flags Pavel Begunkov
                   ` (8 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We cache all the references to task + tctx, so if io_put_task() is
called by the corresponding task itself, we can save on atomics and
return the refs right back into the cache.

It's beneficial for all inline completions, and also for iopolling,
when polling and submissions are done by the same task, including
SQPOLL|IOPOLL.

Note: io_uring_cancel_generic() can return refs to the cache as well,
so those should be flushed in the loop for tctx_inflight() to work
right.
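
Why the flush matters, as a sketch of the counting helper (assuming
tctx_inflight() is roughly the following; not part of this patch):
refs parked in tctx->cached_refs are still accounted in the inflight
counter, so it can't drop to zero while they sit in the cache:

	static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
	{
		if (tracked)
			return atomic_read(&tctx->inflight_tracked);
		/* still includes refs parked in tctx->cached_refs */
		return percpu_counter_sum(&tctx->inflight);
	}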

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3c5c4cf73d1c..0982b0dba6b0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2099,10 +2099,12 @@ static inline void io_init_req_batch(struct req_batch *rb)
 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 				     struct req_batch *rb)
 {
-	if (rb->task)
-		io_put_task(rb->task, rb->task_refs);
 	if (rb->ctx_refs)
 		percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
+	if (rb->task == current)
+		current->io_uring->cached_refs += rb->task_refs;
+	else if (rb->task)
+		io_put_task(rb->task, rb->task_refs);
 }
 
 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
@@ -9143,9 +9145,11 @@ static void io_uring_drop_tctx_refs(struct task_struct *task)
 	struct io_uring_task *tctx = task->io_uring;
 	unsigned int refs = tctx->cached_refs;
 
-	tctx->cached_refs = 0;
-	percpu_counter_sub(&tctx->inflight, refs);
-	put_task_struct_many(task, refs);
+	if (refs) {
+		tctx->cached_refs = 0;
+		percpu_counter_sub(&tctx->inflight, refs);
+		put_task_struct_many(task, refs);
+	}
 }
 
 /*
@@ -9166,9 +9170,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (tctx->io_wq)
 		io_wq_exit_start(tctx->io_wq);
 
-	io_uring_drop_tctx_refs(current);
 	atomic_inc(&tctx->in_idle);
 	do {
+		io_uring_drop_tctx_refs(current);
 		/* read completions before cancelations */
 		inflight = tctx_inflight(tctx, !cancel_all);
 		if (!inflight)
@@ -9192,6 +9196,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 		}
 
 		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+		io_uring_drop_tctx_refs(current);
 		/*
 		 * If we've seen completions, retry without waiting. This
 		 * avoids a race where a completion comes in before we did
-- 
2.32.0



* [PATCH 21/28] io_uring: hide async data behind flags
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (19 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 20/28] io_uring: optimise putting task struct Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 17:30   ` Jens Axboe
  2021-08-09 12:04 ` [PATCH 22/28] io_uring: move io_fallback_req_func() Pavel Begunkov
                   ` (7 subsequent siblings)
  28 siblings, 1 reply; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Checking flags is a bit faster and can be batched, but the main reason
for controlling ->async_data with req->flags rather than relying on it
being NULL is that we can now safely move it to the end of io_kiocb,
where cachelines are rarely loaded, and use the freed space for
something hotter like io_mapped_ubuf.
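
The "can be batched" part means the free now folds into the existing
cleanup mask, roughly (a simplified sketch of the dismantle path, see
io_clean_op() in the diff below):

	/* a single test of req->flags now also covers ->async_data */
	if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
		io_clean_op(req);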

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 70 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0982b0dba6b0..9e359acf2f51 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -106,7 +106,8 @@
 				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 				IOSQE_BUFFER_SELECT)
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
-				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
+				REQ_F_ASYNC_DATA)
 
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
@@ -716,6 +717,7 @@ enum {
 	REQ_F_REISSUE_BIT,
 	REQ_F_DONT_REISSUE_BIT,
 	REQ_F_CREDS_BIT,
+	REQ_F_ASYNC_DATA_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_NOWAIT_READ_BIT,
 	REQ_F_NOWAIT_WRITE_BIT,
@@ -771,6 +773,8 @@ enum {
 	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
 	/* has creds assigned */
 	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
+	/* ->async_data allocated */
+	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
 };
 
 struct async_poll {
@@ -828,8 +832,6 @@ struct io_kiocb {
 		struct io_completion	compl;
 	};
 
-	/* opcode allocated if it needs to store data for async defer */
-	void				*async_data;
 	u8				opcode;
 	/* polled IO has completed */
 	u8				iopoll_completed;
@@ -845,6 +847,14 @@ struct io_kiocb {
 
 	struct io_kiocb			*link;
 	struct percpu_ref		*fixed_rsrc_refs;
+	/* store used ubuf, so we can prevent reloading */
+	struct io_mapped_ubuf		*imu;
+
+	/*
+	 * Opcode allocated if it needs to store data for async defer,
+	 * only valid if REQ_F_ASYNC_DATA is set
+	 */
+	void				*async_data;
 
 	/* used with ctx->iopoll_list with reads/writes */
 	struct list_head		inflight_entry;
@@ -853,10 +863,8 @@ struct io_kiocb {
 	struct hlist_node		hash_node;
 	struct async_poll		*apoll;
 	struct io_wq_work		work;
+	/* only valid when REQ_F_CREDS is set */
 	const struct cred		*creds;
-
-	/* store used ubuf, so we can prevent reloading */
-	struct io_mapped_ubuf		*imu;
 };
 
 struct io_tctx_node {
@@ -1127,6 +1135,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
 	return false;
 }
 
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+	return req->flags & REQ_F_ASYNC_DATA;
+}
+
 static inline void req_set_fail(struct io_kiocb *req)
 {
 	req->flags |= REQ_F_FAIL;
@@ -1808,10 +1821,6 @@ static void io_dismantle_req(struct io_kiocb *req)
 		io_put_file(req->file);
 	if (req->fixed_rsrc_refs)
 		percpu_ref_put(req->fixed_rsrc_refs);
-	if (req->async_data) {
-		kfree(req->async_data);
-		req->async_data = NULL;
-	}
 }
 
 static void __io_free_req(struct io_kiocb *req)
@@ -2422,7 +2431,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *rw = req->async_data;
 
-	if (!rw)
+	if (!req_has_async_data(req))
 		return !io_req_prep_async(req);
 	if (rw->iter.truncated)
 		return false;
@@ -2766,7 +2775,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 	bool check_reissue = kiocb->ki_complete == io_complete_rw;
 
 	/* add previously done IO, if any */
-	if (io && io->bytes_done > 0) {
+	if (req_has_async_data(req) && io->bytes_done > 0) {
 		if (ret < 0)
 			ret = io->bytes_done;
 		else
@@ -3141,6 +3150,8 @@ static inline int io_alloc_async_data(struct io_kiocb *req)
 {
 	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
 	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
+	if (req->async_data)
+		req->flags |= REQ_F_ASYNC_DATA;
 	return req->async_data == NULL;
 }
 
@@ -3150,7 +3161,7 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 {
 	if (!force && !io_op_defs[req->opcode].needs_async_setup)
 		return 0;
-	if (!req->async_data) {
+	if (!req_has_async_data(req)) {
 		if (io_alloc_async_data(req)) {
 			kfree(iovec);
 			return -ENOMEM;
@@ -3274,11 +3285,12 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
 	struct iov_iter __iter, *iter = &__iter;
-	struct io_async_rw *rw = req->async_data;
+	struct io_async_rw *rw = NULL;
 	ssize_t io_size, ret, ret2;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
-	if (rw) {
+	if (req_has_async_data(req)) {
+		rw = req->async_data;
 		iter = &rw->iter;
 		iovec = NULL;
 	} else {
@@ -3381,11 +3393,12 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
 	struct iov_iter __iter, *iter = &__iter;
-	struct io_async_rw *rw = req->async_data;
+	struct io_async_rw *rw = NULL;
 	ssize_t ret, ret2, io_size;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
-	if (rw) {
+	if (req_has_async_data(req)) {
+		rw = req->async_data;
 		iter = &rw->iter;
 		iovec = NULL;
 	} else {
@@ -4385,8 +4398,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
-	kmsg = req->async_data;
-	if (!kmsg) {
+	if (req_has_async_data(req)) {
+		kmsg = req->async_data;
+	} else {
 		ret = io_sendmsg_copy_hdr(req, &iomsg);
 		if (ret)
 			return ret;
@@ -4609,8 +4623,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
-	kmsg = req->async_data;
-	if (!kmsg) {
+	if (req_has_async_data(req)) {
+		kmsg = req->async_data;
+	} else {
 		ret = io_recvmsg_copy_hdr(req, &iomsg);
 		if (ret)
 			return ret;
@@ -4776,7 +4791,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
-	if (req->async_data) {
+	if (req_has_async_data(req)) {
 		io = req->async_data;
 	} else {
 		ret = move_addr_to_kernel(req->connect.addr,
@@ -4792,7 +4807,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 	ret = __sys_connect_file(req->file, &io->address,
 					req->connect.addr_len, file_flags);
 	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
-		if (req->async_data)
+		if (req_has_async_data(req))
 			return -EAGAIN;
 		if (io_alloc_async_data(req)) {
 			ret = -ENOMEM;
@@ -5675,7 +5690,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (unlikely(off && !req->ctx->off_timeout_used))
 		req->ctx->off_timeout_used = true;
 
-	if (!req->async_data && io_alloc_async_data(req))
+	if (!req_has_async_data(req) && io_alloc_async_data(req))
 		return -ENOMEM;
 
 	data = req->async_data;
@@ -5990,7 +6005,7 @@ static int io_req_prep_async(struct io_kiocb *req)
 {
 	if (!io_op_defs[req->opcode].needs_async_setup)
 		return 0;
-	if (WARN_ON_ONCE(req->async_data))
+	if (WARN_ON_ONCE(req_has_async_data(req)))
 		return -EFAULT;
 	if (io_alloc_async_data(req))
 		return -EAGAIN;
@@ -6156,7 +6171,10 @@ static void io_clean_op(struct io_kiocb *req)
 	}
 	if (req->flags & REQ_F_CREDS)
 		put_cred(req->creds);
-
+	if (req->flags & REQ_F_ASYNC_DATA) {
+		kfree(req->async_data);
+		req->async_data = NULL;
+	}
 	req->flags &= ~IO_REQ_CLEAN_FLAGS;
 }
 
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 22/28] io_uring: move io_fallback_req_func()
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (20 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 21/28] io_uring: hide async data behind flags Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 23/28] io_uring: cache __io_free_req()'d requests Pavel Begunkov
                   ` (6 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Move io_fallback_req_func() to kill yet another forward declaration.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9e359acf2f51..8b07bdb11430 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1074,8 +1074,6 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx);
 static bool io_poll_remove_waitqs(struct io_kiocb *req);
 static int io_req_prep_async(struct io_kiocb *req);
 
-static void io_fallback_req_func(struct work_struct *unused);
-
 static struct kmem_cache *req_cachep;
 
 static const struct file_operations io_uring_fops;
@@ -1157,6 +1155,17 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
 	return !req->timeout.off;
 }
 
+static void io_fallback_req_func(struct work_struct *work)
+{
+	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
+						fallback_work.work);
+	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
+	struct io_kiocb *req, *tmp;
+
+	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
+		req->io_task_work.func(req);
+}
+
 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
 	struct io_ring_ctx *ctx;
@@ -2476,17 +2485,6 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
 }
 #endif
 
-static void io_fallback_req_func(struct work_struct *work)
-{
-	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
-						fallback_work.work);
-	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
-	struct io_kiocb *req, *tmp;
-
-	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
-		req->io_task_work.func(req);
-}
-
 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 			     unsigned int issue_flags)
 {
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 23/28] io_uring: cache __io_free_req()'d requests
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (21 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 22/28] io_uring: move io_fallback_req_func() Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 24/28] io_uring: remove redundant args from cache_free Pavel Begunkov
                   ` (5 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Don't kfree() requests in __io_free_req(); put them back into the
internal request cache instead. That makes allocation more sustainable
and will be used by upcoming refcounting optimisations.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8b07bdb11430..ec1cab2b9a91 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1835,11 +1835,16 @@ static void io_dismantle_req(struct io_kiocb *req)
 static void __io_free_req(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	unsigned long flags;
 
 	io_dismantle_req(req);
 	io_put_task(req->task, 1);
 
-	kmem_cache_free(req_cachep, req);
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	list_add(&req->compl.list, &ctx->locked_free_list);
+	ctx->locked_free_nr++;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
 	percpu_ref_put(&ctx->refs);
 }
 
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 24/28] io_uring: remove redundant args from cache_free
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (22 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 23/28] io_uring: cache __io_free_req()'d requests Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 25/28] io_uring: use inflight_entry instead of compl.list Pavel Begunkov
                   ` (4 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We don't use the @tsk argument of io_req_cache_free(), so remove it.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ec1cab2b9a91..92854f62ee21 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8646,13 +8646,11 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 		__io_remove_buffers(ctx, buf, index, -1U);
 }
 
-static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
+static void io_req_cache_free(struct list_head *list)
 {
 	struct io_kiocb *req, *nxt;
 
 	list_for_each_entry_safe(req, nxt, list, compl.list) {
-		if (tsk && req->task != tsk)
-			continue;
 		list_del(&req->compl.list);
 		kmem_cache_free(req_cachep, req);
 	}
@@ -8672,7 +8670,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 	}
 
 	io_flush_cached_locked_reqs(ctx, cs);
-	io_req_cache_free(&cs->free_list, NULL);
+	io_req_cache_free(&cs->free_list);
 	mutex_unlock(&ctx->uring_lock);
 }
 
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 25/28] io_uring: use inflight_entry instead of compl.list
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (23 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 24/28] io_uring: remove redundant args from cache_free Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 26/28] io_uring: inline struct io_comp_state Pavel Begunkov
                   ` (3 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

req->compl.list is used to cache freed requests, so its use can't
overlap in time with req->inflight_entry. Reuse inflight_entry for
linking cached requests and remove compl.list.
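
In other words, the two uses never coexist for the same request, so one
list_head can serve both roles (illustrative only, not hunks from this
patch):

	/* while the request is alive, e.g. queued for iopoll */
	list_add_tail(&req->inflight_entry, &ctx->iopoll_list);

	/* and only once it has been freed into the cache, the same
	 * field links it there */
	list_add(&req->inflight_entry, &ctx->locked_free_list);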

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 92854f62ee21..aaddbb4ce4ef 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -670,7 +670,6 @@ struct io_unlink {
 
 struct io_completion {
 	struct file			*file;
-	struct list_head		list;
 	u32				cflags;
 };
 
@@ -1678,7 +1677,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
 		}
 		io_dismantle_req(req);
 		io_put_task(req->task, 1);
-		list_add(&req->compl.list, &ctx->locked_free_list);
+		list_add(&req->inflight_entry, &ctx->locked_free_list);
 		ctx->locked_free_nr++;
 	} else {
 		if (!percpu_ref_tryget(&ctx->refs))
@@ -1769,9 +1768,9 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 	nr = state->free_reqs;
 	while (!list_empty(&cs->free_list)) {
 		struct io_kiocb *req = list_first_entry(&cs->free_list,
-						struct io_kiocb, compl.list);
+					struct io_kiocb, inflight_entry);
 
-		list_del(&req->compl.list);
+		list_del(&req->inflight_entry);
 		state->reqs[nr++] = req;
 		if (nr == ARRAY_SIZE(state->reqs))
 			break;
@@ -1841,7 +1840,7 @@ static void __io_free_req(struct io_kiocb *req)
 	io_put_task(req->task, 1);
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	list_add(&req->compl.list, &ctx->locked_free_list);
+	list_add(&req->inflight_entry, &ctx->locked_free_list);
 	ctx->locked_free_nr++;
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
@@ -2148,7 +2147,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
 	if (state->free_reqs != ARRAY_SIZE(state->reqs))
 		state->reqs[state->free_reqs++] = req;
 	else
-		list_add(&req->compl.list, &state->comp.free_list);
+		list_add(&req->inflight_entry, &state->comp.free_list);
 }
 
 static void io_submit_flush_completions(struct io_ring_ctx *ctx)
@@ -8650,8 +8649,8 @@ static void io_req_cache_free(struct list_head *list)
 {
 	struct io_kiocb *req, *nxt;
 
-	list_for_each_entry_safe(req, nxt, list, compl.list) {
-		list_del(&req->compl.list);
+	list_for_each_entry_safe(req, nxt, list, inflight_entry) {
+		list_del(&req->inflight_entry);
 		kmem_cache_free(req_cachep, req);
 	}
 }
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 26/28] io_uring: inline struct io_comp_state
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (24 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 25/28] io_uring: use inflight_entry instead of compl.list Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 27/28] io_uring: remove extra argument for overflow flush Pavel Begunkov
                   ` (2 subsequent siblings)
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Inline struct io_comp_state into struct io_submit_state. The two are
already tightly coupled, and given their mixed responsibilities, keeping
them separate only adds confusion.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 61 +++++++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 34 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index aaddbb4ce4ef..749c0712d98e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -299,13 +299,6 @@ struct io_sq_data {
 #define IO_REQ_CACHE_SIZE		32
 #define IO_REQ_ALLOC_BATCH		8
 
-struct io_comp_state {
-	struct io_kiocb		*reqs[IO_COMPL_BATCH];
-	unsigned int		nr;
-	/* inline/task_work completion list, under ->uring_lock */
-	struct list_head	free_list;
-};
-
 struct io_submit_link {
 	struct io_kiocb		*head;
 	struct io_kiocb		*last;
@@ -326,7 +319,10 @@ struct io_submit_state {
 	/*
 	 * Batch completion logic
 	 */
-	struct io_comp_state	comp;
+	struct io_kiocb		*compl_reqs[IO_COMPL_BATCH];
+	unsigned int		compl_nr;
+	/* inline/task_work completion list, under ->uring_lock */
+	struct list_head	free_list;
 
 	/*
 	 * File reference cache
@@ -1218,7 +1214,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 	init_llist_head(&ctx->rsrc_put_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
-	INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
+	INIT_LIST_HEAD(&ctx->submit_state.free_list);
 	INIT_LIST_HEAD(&ctx->locked_free_list);
 	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 	return ctx;
@@ -1742,10 +1738,10 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 }
 
 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
-					struct io_comp_state *cs)
+					struct io_submit_state *state)
 {
 	spin_lock_irq(&ctx->completion_lock);
-	list_splice_init(&ctx->locked_free_list, &cs->free_list);
+	list_splice_init(&ctx->locked_free_list, &state->free_list);
 	ctx->locked_free_nr = 0;
 	spin_unlock_irq(&ctx->completion_lock);
 }
@@ -1754,7 +1750,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *state = &ctx->submit_state;
-	struct io_comp_state *cs = &state->comp;
 	int nr;
 
 	/*
@@ -1763,11 +1758,11 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 	 * side cache.
 	 */
 	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
-		io_flush_cached_locked_reqs(ctx, cs);
+		io_flush_cached_locked_reqs(ctx, state);
 
 	nr = state->free_reqs;
-	while (!list_empty(&cs->free_list)) {
-		struct io_kiocb *req = list_first_entry(&cs->free_list,
+	while (!list_empty(&state->free_list)) {
+		struct io_kiocb *req = list_first_entry(&state->free_list,
 					struct io_kiocb, inflight_entry);
 
 		list_del(&req->inflight_entry);
@@ -1950,7 +1945,7 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx)
 {
 	if (!ctx)
 		return;
-	if (ctx->submit_state.comp.nr) {
+	if (ctx->submit_state.compl_nr) {
 		mutex_lock(&ctx->uring_lock);
 		io_submit_flush_completions(ctx);
 		mutex_unlock(&ctx->uring_lock);
@@ -2147,19 +2142,19 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
 	if (state->free_reqs != ARRAY_SIZE(state->reqs))
 		state->reqs[state->free_reqs++] = req;
 	else
-		list_add(&req->inflight_entry, &state->comp.free_list);
+		list_add(&req->inflight_entry, &state->free_list);
 }
 
 static void io_submit_flush_completions(struct io_ring_ctx *ctx)
 	__must_hold(&req->ctx->uring_lock)
 {
-	struct io_comp_state *cs = &ctx->submit_state.comp;
-	int i, nr = cs->nr;
+	struct io_submit_state *state = &ctx->submit_state;
+	int i, nr = state->compl_nr;
 	struct req_batch rb;
 
 	spin_lock_irq(&ctx->completion_lock);
 	for (i = 0; i < nr; i++) {
-		struct io_kiocb *req = cs->reqs[i];
+		struct io_kiocb *req = state->compl_reqs[i];
 
 		__io_cqring_fill_event(ctx, req->user_data, req->result,
 					req->compl.cflags);
@@ -2170,7 +2165,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx)
 
 	io_init_req_batch(&rb);
 	for (i = 0; i < nr; i++) {
-		struct io_kiocb *req = cs->reqs[i];
+		struct io_kiocb *req = state->compl_reqs[i];
 
 		/* submission and completion refs */
 		if (req_ref_sub_and_test(req, 2))
@@ -2178,7 +2173,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx)
 	}
 
 	io_req_free_batch_finish(ctx, &rb);
-	cs->nr = 0;
+	state->compl_nr = 0;
 }
 
 /*
@@ -6503,10 +6498,10 @@ static void __io_queue_sqe(struct io_kiocb *req)
 		/* drop submission reference */
 		if (req->flags & REQ_F_COMPLETE_INLINE) {
 			struct io_ring_ctx *ctx = req->ctx;
-			struct io_comp_state *cs = &ctx->submit_state.comp;
+			struct io_submit_state *state = &ctx->submit_state;
 
-			cs->reqs[cs->nr++] = req;
-			if (cs->nr == ARRAY_SIZE(cs->reqs))
+			state->compl_reqs[state->compl_nr++] = req;
+			if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
 				io_submit_flush_completions(ctx);
 		} else {
 			io_put_req(req);
@@ -6709,7 +6704,7 @@ static void io_submit_state_end(struct io_submit_state *state,
 {
 	if (state->link.head)
 		io_queue_sqe(state->link.head);
-	if (state->comp.nr)
+	if (state->compl_nr)
 		io_submit_flush_completions(ctx);
 	if (state->plug_started)
 		blk_finish_plug(&state->plug);
@@ -8657,19 +8652,17 @@ static void io_req_cache_free(struct list_head *list)
 
 static void io_req_caches_free(struct io_ring_ctx *ctx)
 {
-	struct io_submit_state *submit_state = &ctx->submit_state;
-	struct io_comp_state *cs = &ctx->submit_state.comp;
+	struct io_submit_state *state = &ctx->submit_state;
 
 	mutex_lock(&ctx->uring_lock);
 
-	if (submit_state->free_reqs) {
-		kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
-				     submit_state->reqs);
-		submit_state->free_reqs = 0;
+	if (state->free_reqs) {
+		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
+		state->free_reqs = 0;
 	}
 
-	io_flush_cached_locked_reqs(ctx, cs);
-	io_req_cache_free(&cs->free_list);
+	io_flush_cached_locked_reqs(ctx, state);
+	io_req_cache_free(&state->free_list);
 	mutex_unlock(&ctx->uring_lock);
 }
 
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 27/28] io_uring: remove extra argument for overflow flush
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (25 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 26/28] io_uring: inline struct io_comp_state Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 12:04 ` [PATCH 28/28] io_uring: inline io_poll_remove_waitqs Pavel Begunkov
  2021-08-09 17:48 ` [PATCH v2 00/28] for-next patches Jens Axboe
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Unlike __io_cqring_overflow_flush(), nobody does forced flushing with
io_cqring_overflow_flush(), so remove the argument from it.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 749c0712d98e..9070b7cbd1c3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1533,7 +1533,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	return all_flushed;
 }
 
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 {
 	bool ret = true;
 
@@ -1541,7 +1541,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		/* iopoll syncs against uring_lock, not completion_lock */
 		if (ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_lock(&ctx->uring_lock);
-		ret = __io_cqring_overflow_flush(ctx, force);
+		ret = __io_cqring_overflow_flush(ctx, false);
 		if (ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_unlock(&ctx->uring_lock);
 	}
@@ -7075,7 +7075,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	int ret;
 
 	do {
-		io_cqring_overflow_flush(ctx, false);
+		io_cqring_overflow_flush(ctx);
 		if (io_cqring_events(ctx) >= min_events)
 			return 0;
 		if (!io_run_task_work())
@@ -7113,7 +7113,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
 		/* if we can't even flush overflow, don't wait for more */
-		if (!io_cqring_overflow_flush(ctx, false)) {
+		if (!io_cqring_overflow_flush(ctx)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -9388,7 +9388,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	 */
 	ret = 0;
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
-		io_cqring_overflow_flush(ctx, false);
+		io_cqring_overflow_flush(ctx);
 
 		ret = -EOWNERDEAD;
 		if (unlikely(ctx->sq_data->thread == NULL))
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 28/28] io_uring: inline io_poll_remove_waitqs
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (26 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 27/28] io_uring: remove extra argument for overflow flush Pavel Begunkov
@ 2021-08-09 12:04 ` Pavel Begunkov
  2021-08-09 17:48 ` [PATCH v2 00/28] for-next patches Jens Axboe
  28 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 12:04 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Inline io_poll_remove_waitqs() into its only user and clean it up.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9070b7cbd1c3..f6fa635b3ab6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1066,7 +1066,6 @@ static void io_rsrc_put_work(struct work_struct *work);
 
 static void io_req_task_queue(struct io_kiocb *req);
 static void io_submit_flush_completions(struct io_ring_ctx *ctx);
-static bool io_poll_remove_waitqs(struct io_kiocb *req);
 static int io_req_prep_async(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
@@ -5267,34 +5266,24 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
 	return do_complete;
 }
 
-static bool io_poll_remove_waitqs(struct io_kiocb *req)
+static bool io_poll_remove_one(struct io_kiocb *req)
 	__must_hold(&req->ctx->completion_lock)
 {
+	int refs;
 	bool do_complete;
 
 	io_poll_remove_double(req);
 	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
 
-	if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
-		/* non-poll requests have submit ref still */
-		req_ref_put(req);
-	}
-	return do_complete;
-}
-
-static bool io_poll_remove_one(struct io_kiocb *req)
-	__must_hold(&req->ctx->completion_lock)
-{
-	bool do_complete;
-
-	do_complete = io_poll_remove_waitqs(req);
 	if (do_complete) {
 		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
 		io_commit_cqring(req->ctx);
 		req_set_fail(req);
-		io_put_req_deferred(req, 1);
-	}
 
+		/* non-poll requests have submit ref still */
+		refs = 1 + (req->opcode != IORING_OP_POLL_ADD);
+		io_put_req_deferred(req, refs);
+	}
 	return do_complete;
 }
 
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 21/28] io_uring: hide async data behind flags
  2021-08-09 12:04 ` [PATCH 21/28] io_uring: hide async data behind flags Pavel Begunkov
@ 2021-08-09 17:30   ` Jens Axboe
  2021-08-09 17:44     ` Pavel Begunkov
  0 siblings, 1 reply; 32+ messages in thread
From: Jens Axboe @ 2021-08-09 17:30 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 8/9/21 6:04 AM, Pavel Begunkov wrote:
> Checking flags is a bit faster and can be batched, but the main reason
> for controlling ->async_data with req->flags rather than relying on it
> being NULL is that we can now safely move it to the end of io_kiocb,
> where cachelines are rarely loaded, and use the freed space for
> something hotter like io_mapped_ubuf.

As far as I can tell, this will run into an issue with double poll:

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 
                                 struct poll_table_struct *p)
{                                                                               
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);   
                                                                                  
	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
}

where we store the potential extra allocation, if any, in the async_data
field. That also needs to get freed when we release this request. One
solution would be to just set REQ_F_ASYNC_DATA before calling
__io_queue_proc().
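
Rough sketch of that suggestion (untested; it assumes ->async_data is
NULL-initialised for the request at this point, otherwise the flag
would have to be set only once the extra entry is actually allocated):

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);

	/* let io_clean_op() know it may have to free a double poll entry */
	pt->req->flags |= REQ_F_ASYNC_DATA;
	__io_queue_proc(&pt->req->poll, pt, head,
			(struct io_poll_iocb **) &pt->req->async_data);
}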

> @@ -3141,6 +3150,8 @@ static inline int io_alloc_async_data(struct io_kiocb *req)
>  {
>  	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
>  	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
> +	if (req->async_data)
> +		req->flags |= REQ_F_ASYNC_DATA;
>  	return req->async_data == NULL;
>  }

With this change, it would be better to simply do:

if (req->async_data) {
	req->flags |= REQ_F_ASYNC_DATA;
	return false;
}

return true;

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 21/28] io_uring: hide async data behind flags
  2021-08-09 17:30   ` Jens Axboe
@ 2021-08-09 17:44     ` Pavel Begunkov
  0 siblings, 0 replies; 32+ messages in thread
From: Pavel Begunkov @ 2021-08-09 17:44 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 8/9/21 6:30 PM, Jens Axboe wrote:
> On 8/9/21 6:04 AM, Pavel Begunkov wrote:
>> Checking flags is a bit faster and can be batched, but the main reason
>> for controlling ->async_data with req->flags rather than relying on it
>> being NULL is that we can now safely move it to the end of io_kiocb,
>> where cachelines are rarely loaded, and use the freed space for
>> something hotter like io_mapped_ubuf.
> 
> As far as I can tell, this will run into an issue with double poll:
> 
> static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 
>                                  struct poll_table_struct *p)
> {                                                                               
> 	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);   
>                                                                                   
> 	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
> }
> 
> where we store the potential extra allocation, if any, in the async_data
> field. That also needs to get freed when we release this request. One
> solution would be to just set REQ_F_ASYNC_DATA before calling
> __io_queue_proc().

Indeed, good catch. It appears the end result of the bug is a leak.


>> @@ -3141,6 +3150,8 @@ static inline int io_alloc_async_data(struct io_kiocb *req)
>>  {
>>  	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
>>  	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
>> +	if (req->async_data)
>> +		req->flags |= REQ_F_ASYNC_DATA;
>>  	return req->async_data == NULL;
>>  }
> 
> With this change, would be better to simply do:
> 
> if (req->async_data) {
> 	req->flags |= REQ_F_ASYNC_DATA;
> 	return false;
> }
> 
> return true;
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2 00/28] for-next patches
  2021-08-09 12:04 [PATCH v2 00/28] for-next patches Pavel Begunkov
                   ` (27 preceding siblings ...)
  2021-08-09 12:04 ` [PATCH 28/28] io_uring: inline io_poll_remove_waitqs Pavel Begunkov
@ 2021-08-09 17:48 ` Jens Axboe
  28 siblings, 0 replies; 32+ messages in thread
From: Jens Axboe @ 2021-08-09 17:48 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 8/9/21 6:04 AM, Pavel Begunkov wrote:
> 1,2 -- optimisation with kvmalloc()'ing file tables
> 4 -- optimise prep_rw() still touching inode with !REG fixed files
> 11 -- a small CQ waiting optimisation
> 20 -- put_task optimisation, saves atomics in many cases
> 23 -- helps req alloc sustainability, also needed for futures features
> 
> All others are cleanups, where 6-28 are resends.

Applied 1-20, stopped at 21 as that needs a re-spin.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 32+ messages in thread
