public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH 0/5] skip request refcounting
@ 2021-08-10 12:05 Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 1/5] io_uring: move req_ref_get() and friends Pavel Begunkov
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-08-10 12:05 UTC (permalink / raw)
  To: Jens Axboe, io-uring

With some tricks, we can avoid refcounting in most of the cases and
so save on atomics. The series implements this optimisation. 1-4 are
simple enough preparations, the biggest part is 5/5. Would be great
to have extra pair of eyes on it.

Jens tried out a prototype before, apparently it gave ~3% win for
the default read test. Not much has changed since then, so I'd
expect same result, and also hope that it should be of even greater
benefit to multithreaded workloads.

Pavel Begunkov (5):
  io_uring: move req_ref_get() and friends
  io_uring: delay freeing ->async_data
  io_uring: protect rsrc dealloc by uring_lock
  io_uring: remove req_ref_sub_and_test()
  io_uring: request refcounting skipping

 fs/io_uring.c | 176 +++++++++++++++++++++++++++++---------------------
 1 file changed, 101 insertions(+), 75 deletions(-)

-- 
2.32.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/5] io_uring: move req_ref_get() and friends
  2021-08-10 12:05 [PATCH 0/5] skip request refcounting Pavel Begunkov
@ 2021-08-10 12:05 ` Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 2/5] io_uring: delay freeing ->async_data Pavel Begunkov
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-08-10 12:05 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Move all request refcount helpers to avoid forward declarations in the
future.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 70 +++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3b17fce0b7e3..e7dabbe885b3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1081,6 +1081,41 @@ EXPORT_SYMBOL(io_uring_get_socket);
 #define io_for_each_link(pos, head) \
 	for (pos = (head); pos; pos = pos->link)
 
+/*
+ * Shamelessly stolen from the mm implementation of page reference checking,
+ * see commit f958d7b528b1 for details.
+ */
+#define req_ref_zero_or_close_to_overflow(req)	\
+	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
+
+static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
+{
+	return atomic_inc_not_zero(&req->refs);
+}
+
+static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
+{
+	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+	return atomic_sub_and_test(refs, &req->refs);
+}
+
+static inline bool req_ref_put_and_test(struct io_kiocb *req)
+{
+	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+	return atomic_dec_and_test(&req->refs);
+}
+
+static inline void req_ref_put(struct io_kiocb *req)
+{
+	WARN_ON_ONCE(req_ref_put_and_test(req));
+}
+
+static inline void req_ref_get(struct io_kiocb *req)
+{
+	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+	atomic_inc(&req->refs);
+}
+
 static inline void io_req_set_rsrc_node(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -1539,41 +1574,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 	return ret;
 }
 
-/*
- * Shamelessly stolen from the mm implementation of page reference checking,
- * see commit f958d7b528b1 for details.
- */
-#define req_ref_zero_or_close_to_overflow(req)	\
-	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
-
-static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
-{
-	return atomic_inc_not_zero(&req->refs);
-}
-
-static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
-{
-	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
-	return atomic_sub_and_test(refs, &req->refs);
-}
-
-static inline bool req_ref_put_and_test(struct io_kiocb *req)
-{
-	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
-	return atomic_dec_and_test(&req->refs);
-}
-
-static inline void req_ref_put(struct io_kiocb *req)
-{
-	WARN_ON_ONCE(req_ref_put_and_test(req));
-}
-
-static inline void req_ref_get(struct io_kiocb *req)
-{
-	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
-	atomic_inc(&req->refs);
-}
-
 /* must to be called somewhat shortly after putting a request */
 static inline void io_put_task(struct task_struct *task, int nr)
 {
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/5] io_uring: delay freeing ->async_data
  2021-08-10 12:05 [PATCH 0/5] skip request refcounting Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 1/5] io_uring: move req_ref_get() and friends Pavel Begunkov
@ 2021-08-10 12:05 ` Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 3/5] io_uring: protect rsrc dealloc by uring_lock Pavel Begunkov
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-08-10 12:05 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Prepare for upcoming removal of submission references by delaying
->async_data deallocation, preventing kfree()'ing it up until
io_issue_sqe() returns.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e7dabbe885b3..8ca9895535dd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1771,6 +1771,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *state = &ctx->submit_state;
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+	struct io_kiocb *req;
 	int ret, i;
 
 	BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
@@ -1796,8 +1797,18 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 		io_preinit_req(state->reqs[i], ctx);
 	state->free_reqs = ret;
 got_req:
-	state->free_reqs--;
-	return state->reqs[state->free_reqs];
+	req = state->reqs[--state->free_reqs];
+
+	/*
+	 * io_req_free(), dismantle() and co. don't free ->async_data, that's
+	 * needed to prevent io_issue_sqe() from kfree'ing the memory somewhere
+	 * deep down the stack and accessing it afterwards.
+	 */
+	if (req->async_data) {
+		kfree(req->async_data);
+		req->async_data = NULL;
+	}
+	return req;
 }
 
 static inline void io_put_file(struct file *file)
@@ -1816,10 +1827,6 @@ static void io_dismantle_req(struct io_kiocb *req)
 		io_put_file(req->file);
 	if (req->fixed_rsrc_refs)
 		percpu_ref_put(req->fixed_rsrc_refs);
-	if (req->async_data) {
-		kfree(req->async_data);
-		req->async_data = NULL;
-	}
 }
 
 static void __io_free_req(struct io_kiocb *req)
@@ -8614,6 +8621,8 @@ static void io_req_cache_free(struct list_head *list)
 
 	list_for_each_entry_safe(req, nxt, list, inflight_entry) {
 		list_del(&req->inflight_entry);
+		/* see comment in io_alloc_req() */
+		kfree(req->async_data);
 		kmem_cache_free(req_cachep, req);
 	}
 }
@@ -8621,9 +8630,16 @@ static void io_req_cache_free(struct list_head *list)
 static void io_req_caches_free(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *state = &ctx->submit_state;
+	struct io_kiocb *req;
+	int i;
 
 	mutex_lock(&ctx->uring_lock);
 
+	/* see comment in io_alloc_req() */
+	for (i = 0; i < state->free_reqs; i++) {
+		req = state->reqs[i];
+		kfree(req->async_data);
+	}
 	if (state->free_reqs) {
 		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
 		state->free_reqs = 0;
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/5] io_uring: protect rsrc dealloc by uring_lock
  2021-08-10 12:05 [PATCH 0/5] skip request refcounting Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 1/5] io_uring: move req_ref_get() and friends Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 2/5] io_uring: delay freeing ->async_data Pavel Begunkov
@ 2021-08-10 12:05 ` Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 4/5] io_uring: remove req_ref_sub_and_test() Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 5/5] io_uring: request refcounting skipping Pavel Begunkov
  4 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-08-10 12:05 UTC (permalink / raw)
  To: Jens Axboe, io-uring

As with ->async_data, also protect all rsrc deallocation by uring_lock,
so when we do refcount avoidance they're not removed unexpectedly awhile
someone is still accessing them.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8ca9895535dd..cd3a1058f657 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7664,25 +7664,24 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 	struct io_ring_ctx *ctx = rsrc_data->ctx;
 	struct io_rsrc_put *prsrc, *tmp;
 
+	/* rsrc deallocation must be protected by ->uring_lock */
+	mutex_lock(&ctx->uring_lock);
 	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
 		list_del(&prsrc->list);
 
 		if (prsrc->tag) {
-			bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
-
-			io_ring_submit_lock(ctx, lock_ring);
 			spin_lock_irq(&ctx->completion_lock);
 			io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
 			ctx->cq_extra++;
 			io_commit_cqring(ctx);
 			spin_unlock_irq(&ctx->completion_lock);
 			io_cqring_ev_posted(ctx);
-			io_ring_submit_unlock(ctx, lock_ring);
 		}
 
 		rsrc_data->do_put(ctx, prsrc);
 		kfree(prsrc);
 	}
+	mutex_unlock(&ctx->uring_lock);
 
 	io_rsrc_node_destroy(ref_node);
 	if (atomic_dec_and_test(&rsrc_data->refs))
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/5] io_uring: remove req_ref_sub_and_test()
  2021-08-10 12:05 [PATCH 0/5] skip request refcounting Pavel Begunkov
                   ` (2 preceding siblings ...)
  2021-08-10 12:05 ` [PATCH 3/5] io_uring: protect rsrc dealloc by uring_lock Pavel Begunkov
@ 2021-08-10 12:05 ` Pavel Begunkov
  2021-08-10 12:05 ` [PATCH 5/5] io_uring: request refcounting skipping Pavel Begunkov
  4 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-08-10 12:05 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Soon, we won't need to put several references at once, remove
req_ref_sub_and_test() and @nr argument from io_put_req_deferred(),
and put the rest of the references by hand.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cd3a1058f657..5a08cc967199 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1043,7 +1043,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 				 long res, unsigned int cflags);
 static void io_put_req(struct io_kiocb *req);
-static void io_put_req_deferred(struct io_kiocb *req, int nr);
+static void io_put_req_deferred(struct io_kiocb *req);
 static void io_dismantle_req(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
@@ -1093,12 +1093,6 @@ static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
 	return atomic_inc_not_zero(&req->refs);
 }
 
-static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
-{
-	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
-	return atomic_sub_and_test(refs, &req->refs);
-}
-
 static inline bool req_ref_put_and_test(struct io_kiocb *req)
 {
 	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
@@ -1376,7 +1370,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
 			atomic_read(&req->ctx->cq_timeouts) + 1);
 		list_del_init(&req->timeout.list);
 		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
-		io_put_req_deferred(req, 1);
+		io_put_req_deferred(req);
 	}
 }
 
@@ -1870,7 +1864,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
 		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 			io_cqring_fill_event(link->ctx, link->user_data,
 					     -ECANCELED, 0);
-			io_put_req_deferred(link, 1);
+			io_put_req_deferred(link);
 			return true;
 		}
 	}
@@ -1889,7 +1883,9 @@ static void io_fail_links(struct io_kiocb *req)
 
 		trace_io_uring_fail_link(req, link);
 		io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
-		io_put_req_deferred(link, 2);
+
+		io_put_req(link);
+		io_put_req_deferred(link);
 		link = nxt;
 	}
 }
@@ -2171,7 +2167,8 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx)
 		struct io_kiocb *req = state->compl_reqs[i];
 
 		/* submission and completion refs */
-		if (req_ref_sub_and_test(req, 2))
+		io_put_req(req);
+		if (req_ref_put_and_test(req))
 			io_req_free_batch(&rb, req, &ctx->submit_state);
 	}
 
@@ -2200,9 +2197,9 @@ static inline void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
+static inline void io_put_req_deferred(struct io_kiocb *req)
 {
-	if (req_ref_sub_and_test(req, refs)) {
+	if (req_ref_put_and_test(req)) {
 		req->io_task_work.func = io_free_req;
 		io_req_task_work_add(req);
 	}
@@ -5261,7 +5258,6 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
 static bool io_poll_remove_one(struct io_kiocb *req)
 	__must_hold(&req->ctx->completion_lock)
 {
-	int refs;
 	bool do_complete;
 
 	io_poll_remove_double(req);
@@ -5273,8 +5269,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 		req_set_fail(req);
 
 		/* non-poll requests have submit ref still */
-		refs = 1 + (req->opcode != IORING_OP_POLL_ADD);
-		io_put_req_deferred(req, refs);
+		if (req->opcode != IORING_OP_POLL_ADD)
+			io_put_req(req);
+		io_put_req_deferred(req);
 	}
 	return do_complete;
 }
@@ -5565,7 +5562,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 
 	req_set_fail(req);
 	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
-	io_put_req_deferred(req, 1);
+	io_put_req_deferred(req);
 	return 0;
 }
 
@@ -6416,8 +6413,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
 	if (prev) {
 		io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
-		io_put_req_deferred(prev, 1);
-		io_put_req_deferred(req, 1);
+		io_put_req_deferred(prev);
+		io_put_req_deferred(req);
 	} else {
 		io_req_complete_post(req, -ETIME, 0);
 	}
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 5/5] io_uring: request refcounting skipping
  2021-08-10 12:05 [PATCH 0/5] skip request refcounting Pavel Begunkov
                   ` (3 preceding siblings ...)
  2021-08-10 12:05 ` [PATCH 4/5] io_uring: remove req_ref_sub_and_test() Pavel Begunkov
@ 2021-08-10 12:05 ` Pavel Begunkov
  2021-08-10 13:30   ` Pavel Begunkov
  4 siblings, 1 reply; 7+ messages in thread
From: Pavel Begunkov @ 2021-08-10 12:05 UTC (permalink / raw)
  To: Jens Axboe, io-uring

The intention here is to skip request refcounting in most hot cases,
and use only when it's really needed. So, it should save 2 refcount
atomics per request for IOPOLL and IRQ completions, and 1 atomic per
req for inline completions.

There are cases, however, where the refcounting is enabled back:
- Polling, including apoll. Because double poll entries takes a ref.
  Might get relaxed in the near future.
- Link timeouts, for both, the timeout and the request it's bound to,
  because they work in-parallel and we need to synchronise to cancel one
  of them on completion.
- When in io-wq, because it doesn't hold uring_lock, so the guarantees
  described below doesn't work, and there is also io_wq_free_work().

TL;DR;
Requests were always given with two references. One is called completion
and is generally put at the moment I/O is completed. The second is
submission reference, which is usually put during submission, e.g. by
__io_queue_sqe(). It was needed, because the code actually issuing a
request (e.g. fs, the block layer, etc.), may punt it for async
execution, but still access associated memory while unwinding back to
io_uring.

First, let's notice that now all requests are returned back into the
request cache and not actually kfreed(). Also, we take requests out of
it only under ->uring_lock. So, if I/O submission is also protected by
the mutex, and even if io_issue_sqe() completes a request deep down the
stack, the memory is not going to be freed and the request won't be
re-allocated until the submission stack unwinds back and we unlock
the mutex. If it's not protected by the mutex, we're in io-wq, which
takes a reference anyway resembling submission referencing.

Same with other parts that may get accessed on the way back,
->async_data deallocation is delayed up to io_alloc_req(), and
rsrc deallocation is also now protected by the mutex.

With all that we can kill off submission references. And next, in most
cases we have only one reference travelling along the execution path,
so we can replace it with a flag allowing to enable refcounting when
needed.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5a08cc967199..d65621247709 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -712,6 +712,7 @@ enum {
 	REQ_F_REISSUE_BIT,
 	REQ_F_DONT_REISSUE_BIT,
 	REQ_F_CREDS_BIT,
+	REQ_F_REFCOUNT_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_NOWAIT_READ_BIT,
 	REQ_F_NOWAIT_WRITE_BIT,
@@ -767,6 +768,8 @@ enum {
 	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
 	/* has creds assigned */
 	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
+	/* skip refcounting if not set */
+	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
 };
 
 struct async_poll {
@@ -1090,26 +1093,40 @@ EXPORT_SYMBOL(io_uring_get_socket);
 
 static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
 {
+	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
 	return atomic_inc_not_zero(&req->refs);
 }
 
 static inline bool req_ref_put_and_test(struct io_kiocb *req)
 {
+	if (likely(!(req->flags & REQ_F_REFCOUNT)))
+		return true;
+
 	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 	return atomic_dec_and_test(&req->refs);
 }
 
 static inline void req_ref_put(struct io_kiocb *req)
 {
+	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
 	WARN_ON_ONCE(req_ref_put_and_test(req));
 }
 
 static inline void req_ref_get(struct io_kiocb *req)
 {
+	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
 	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 	atomic_inc(&req->refs);
 }
 
+static inline void io_req_refcount(struct io_kiocb *req)
+{
+	if (!(req->flags & REQ_F_REFCOUNT)) {
+		req->flags |= REQ_F_REFCOUNT;
+		atomic_set(&req->refs, 1);
+	}
+}
+
 static inline void io_req_set_rsrc_node(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -1706,7 +1723,6 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
 static void io_req_complete_failed(struct io_kiocb *req, long res)
 {
 	req_set_fail(req);
-	io_put_req(req);
 	io_req_complete_post(req, res, 0);
 }
 
@@ -1761,7 +1777,14 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 	return nr != 0;
 }
 
+/*
+ * A request might get retired back into the request caches even before opcode
+ * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
+ * Because of that, io_alloc_req() should be called only under ->uring_lock
+ * and with extra caution to not get a request that is still worked on.
+ */
 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+	__must_hold(&req->ctx->uring_lock)
 {
 	struct io_submit_state *state = &ctx->submit_state;
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
@@ -1883,8 +1906,6 @@ static void io_fail_links(struct io_kiocb *req)
 
 		trace_io_uring_fail_link(req, link);
 		io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
-
-		io_put_req(link);
 		io_put_req_deferred(link);
 		link = nxt;
 	}
@@ -2166,8 +2187,6 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx)
 	for (i = 0; i < nr; i++) {
 		struct io_kiocb *req = state->compl_reqs[i];
 
-		/* submission and completion refs */
-		io_put_req(req);
 		if (req_ref_put_and_test(req))
 			io_req_free_batch(&rb, req, &ctx->submit_state);
 	}
@@ -2272,7 +2291,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
 		    !(req->flags & REQ_F_DONT_REISSUE)) {
 			req->iopoll_completed = 0;
-			req_ref_get(req);
 			io_req_task_queue_reissue(req);
 			continue;
 		}
@@ -2787,7 +2805,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 	if (check_reissue && (req->flags & REQ_F_REISSUE)) {
 		req->flags &= ~REQ_F_REISSUE;
 		if (io_resubmit_prep(req)) {
-			req_ref_get(req);
 			io_req_task_queue_reissue(req);
 		} else {
 			int cflags = 0;
@@ -3213,9 +3230,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 
 	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
 	list_del_init(&wait->entry);
-
-	/* submit ref gets dropped, acquire a new one */
-	req_ref_get(req);
 	io_req_task_queue(req);
 	return 1;
 }
@@ -5220,6 +5234,7 @@ static int io_arm_poll_handler(struct io_kiocb *req)
 	req->apoll = apoll;
 	req->flags |= REQ_F_POLLED;
 	ipt.pt._qproc = io_async_queue_proc;
+	io_req_refcount(req);
 
 	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
 					io_async_wake);
@@ -5267,10 +5282,6 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
 		io_commit_cqring(req->ctx);
 		req_set_fail(req);
-
-		/* non-poll requests have submit ref still */
-		if (req->opcode != IORING_OP_POLL_ADD)
-			io_put_req(req);
 		io_put_req_deferred(req);
 	}
 	return do_complete;
@@ -5414,6 +5425,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (flags & ~IORING_POLL_ADD_MULTI)
 		return -EINVAL;
 
+	io_req_refcount(req);
 	poll->events = io_poll_parse_events(sqe, flags);
 	return 0;
 }
@@ -6290,6 +6302,10 @@ static void io_wq_submit_work(struct io_wq_work *work)
 	struct io_kiocb *timeout;
 	int ret = 0;
 
+	io_req_refcount(req);
+	/* will be dropped by ->io_free_work() after returning to io-wq */
+	req_ref_get(req);
+
 	timeout = io_prep_linked_timeout(req);
 	if (timeout)
 		io_queue_linked_timeout(timeout);
@@ -6312,11 +6328,8 @@ static void io_wq_submit_work(struct io_wq_work *work)
 	}
 
 	/* avoid locking problems by failing it from a clean context */
-	if (ret) {
-		/* io-wq is going to take one down */
-		req_ref_get(req);
+	if (ret)
 		io_req_task_queue_fail(req, ret);
-	}
 }
 
 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
@@ -6450,6 +6463,11 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 	    nxt->opcode != IORING_OP_LINK_TIMEOUT)
 		return NULL;
 
+	/* linked timeouts should have two refs once prep'ed */
+	io_req_refcount(req);
+	io_req_refcount(nxt);
+	req_ref_get(nxt);
+
 	nxt->timeout.head = req;
 	nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
 	req->flags |= REQ_F_LINK_TIMEOUT;
@@ -6478,8 +6496,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
 			state->compl_reqs[state->compl_nr++] = req;
 			if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
 				io_submit_flush_completions(ctx);
-		} else {
-			io_put_req(req);
 		}
 	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		switch (io_arm_poll_handler(req)) {
@@ -6559,8 +6575,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
 	req->fixed_rsrc_refs = NULL;
-	/* one is dropped after submission, the other at completion */
-	atomic_set(&req->refs, 2);
 	req->task = current;
 
 	/* enforce forwards compatibility on users */
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 5/5] io_uring: request refcounting skipping
  2021-08-10 12:05 ` [PATCH 5/5] io_uring: request refcounting skipping Pavel Begunkov
@ 2021-08-10 13:30   ` Pavel Begunkov
  0 siblings, 0 replies; 7+ messages in thread
From: Pavel Begunkov @ 2021-08-10 13:30 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 8/10/21 1:05 PM, Pavel Begunkov wrote:
> The intention here is to skip request refcounting in most hot cases,
> and use only when it's really needed. So, it should save 2 refcount
> atomics per request for IOPOLL and IRQ completions, and 1 atomic per
> req for inline completions.
> 
> There are cases, however, where the refcounting is enabled back:
> - Polling, including apoll. Because double poll entries takes a ref.
>   Might get relaxed in the near future.
> - Link timeouts, for both, the timeout and the request it's bound to,
>   because they work in-parallel and we need to synchronise to cancel one
>   of them on completion.
> - When in io-wq, because it doesn't hold uring_lock, so the guarantees
>   described below doesn't work, and there is also io_wq_free_work().
> 
> TL;DR;
> Requests were always given with two references. One is called completion
> and is generally put at the moment I/O is completed. The second is
> submission reference, which is usually put during submission, e.g. by
> __io_queue_sqe(). It was needed, because the code actually issuing a
> request (e.g. fs, the block layer, etc.), may punt it for async
> execution, but still access associated memory while unwinding back to
> io_uring.
> 
> First, let's notice that now all requests are returned back into the
> request cache and not actually kfreed(). Also, we take requests out of
> it only under ->uring_lock. So, if I/O submission is also protected by
> the mutex, and even if io_issue_sqe() completes a request deep down the
> stack, the memory is not going to be freed and the request won't be
> re-allocated until the submission stack unwinds back and we unlock
> the mutex. If it's not protected by the mutex, we're in io-wq, which
> takes a reference anyway resembling submission referencing.

And there is still ->file that might be dropped in the middle and
other bits from io_clean_op(), so will be racy.

> 
> Same with other parts that may get accessed on the way back,
> ->async_data deallocation is delayed up to io_alloc_req(), and
> rsrc deallocation is also now protected by the mutex.
> 
> With all that we can kill off submission references. And next, in most
> cases we have only one reference travelling along the execution path,
> so we can replace it with a flag allowing to enable refcounting when
> needed.
> 
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
>  fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 36 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 5a08cc967199..d65621247709 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -712,6 +712,7 @@ enum {
>  	REQ_F_REISSUE_BIT,
>  	REQ_F_DONT_REISSUE_BIT,
>  	REQ_F_CREDS_BIT,
> +	REQ_F_REFCOUNT_BIT,
>  	/* keep async read/write and isreg together and in order */
>  	REQ_F_NOWAIT_READ_BIT,
>  	REQ_F_NOWAIT_WRITE_BIT,
> @@ -767,6 +768,8 @@ enum {
>  	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
>  	/* has creds assigned */
>  	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
> +	/* skip refcounting if not set */
> +	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
>  };
>  
>  struct async_poll {
> @@ -1090,26 +1093,40 @@ EXPORT_SYMBOL(io_uring_get_socket);
>  
>  static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
>  {
> +	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
>  	return atomic_inc_not_zero(&req->refs);
>  }
>  
>  static inline bool req_ref_put_and_test(struct io_kiocb *req)
>  {
> +	if (likely(!(req->flags & REQ_F_REFCOUNT)))
> +		return true;
> +
>  	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
>  	return atomic_dec_and_test(&req->refs);
>  }
>  
>  static inline void req_ref_put(struct io_kiocb *req)
>  {
> +	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
>  	WARN_ON_ONCE(req_ref_put_and_test(req));
>  }
>  
>  static inline void req_ref_get(struct io_kiocb *req)
>  {
> +	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
>  	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
>  	atomic_inc(&req->refs);
>  }
>  
> +static inline void io_req_refcount(struct io_kiocb *req)
> +{
> +	if (!(req->flags & REQ_F_REFCOUNT)) {
> +		req->flags |= REQ_F_REFCOUNT;
> +		atomic_set(&req->refs, 1);
> +	}
> +}
> +
>  static inline void io_req_set_rsrc_node(struct io_kiocb *req)
>  {
>  	struct io_ring_ctx *ctx = req->ctx;
> @@ -1706,7 +1723,6 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
>  static void io_req_complete_failed(struct io_kiocb *req, long res)
>  {
>  	req_set_fail(req);
> -	io_put_req(req);
>  	io_req_complete_post(req, res, 0);
>  }
>  
> @@ -1761,7 +1777,14 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
>  	return nr != 0;
>  }
>  
> +/*
> + * A request might get retired back into the request caches even before opcode
> + * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
> + * Because of that, io_alloc_req() should be called only under ->uring_lock
> + * and with extra caution to not get a request that is still worked on.
> + */
>  static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
> +	__must_hold(&req->ctx->uring_lock)
>  {
>  	struct io_submit_state *state = &ctx->submit_state;
>  	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
> @@ -1883,8 +1906,6 @@ static void io_fail_links(struct io_kiocb *req)
>  
>  		trace_io_uring_fail_link(req, link);
>  		io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
> -
> -		io_put_req(link);
>  		io_put_req_deferred(link);
>  		link = nxt;
>  	}
> @@ -2166,8 +2187,6 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx)
>  	for (i = 0; i < nr; i++) {
>  		struct io_kiocb *req = state->compl_reqs[i];
>  
> -		/* submission and completion refs */
> -		io_put_req(req);
>  		if (req_ref_put_and_test(req))
>  			io_req_free_batch(&rb, req, &ctx->submit_state);
>  	}
> @@ -2272,7 +2291,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>  		if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
>  		    !(req->flags & REQ_F_DONT_REISSUE)) {
>  			req->iopoll_completed = 0;
> -			req_ref_get(req);
>  			io_req_task_queue_reissue(req);
>  			continue;
>  		}
> @@ -2787,7 +2805,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
>  	if (check_reissue && (req->flags & REQ_F_REISSUE)) {
>  		req->flags &= ~REQ_F_REISSUE;
>  		if (io_resubmit_prep(req)) {
> -			req_ref_get(req);
>  			io_req_task_queue_reissue(req);
>  		} else {
>  			int cflags = 0;
> @@ -3213,9 +3230,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
>  
>  	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
>  	list_del_init(&wait->entry);
> -
> -	/* submit ref gets dropped, acquire a new one */
> -	req_ref_get(req);
>  	io_req_task_queue(req);
>  	return 1;
>  }
> @@ -5220,6 +5234,7 @@ static int io_arm_poll_handler(struct io_kiocb *req)
>  	req->apoll = apoll;
>  	req->flags |= REQ_F_POLLED;
>  	ipt.pt._qproc = io_async_queue_proc;
> +	io_req_refcount(req);
>  
>  	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
>  					io_async_wake);
> @@ -5267,10 +5282,6 @@ static bool io_poll_remove_one(struct io_kiocb *req)
>  		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
>  		io_commit_cqring(req->ctx);
>  		req_set_fail(req);
> -
> -		/* non-poll requests have submit ref still */
> -		if (req->opcode != IORING_OP_POLL_ADD)
> -			io_put_req(req);
>  		io_put_req_deferred(req);
>  	}
>  	return do_complete;
> @@ -5414,6 +5425,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
>  	if (flags & ~IORING_POLL_ADD_MULTI)
>  		return -EINVAL;
>  
> +	io_req_refcount(req);
>  	poll->events = io_poll_parse_events(sqe, flags);
>  	return 0;
>  }
> @@ -6290,6 +6302,10 @@ static void io_wq_submit_work(struct io_wq_work *work)
>  	struct io_kiocb *timeout;
>  	int ret = 0;
>  
> +	io_req_refcount(req);
> +	/* will be dropped by ->io_free_work() after returning to io-wq */
> +	req_ref_get(req);
> +
>  	timeout = io_prep_linked_timeout(req);
>  	if (timeout)
>  		io_queue_linked_timeout(timeout);
> @@ -6312,11 +6328,8 @@ static void io_wq_submit_work(struct io_wq_work *work)
>  	}
>  
>  	/* avoid locking problems by failing it from a clean context */
> -	if (ret) {
> -		/* io-wq is going to take one down */
> -		req_ref_get(req);
> +	if (ret)
>  		io_req_task_queue_fail(req, ret);
> -	}
>  }
>  
>  static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
> @@ -6450,6 +6463,11 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
>  	    nxt->opcode != IORING_OP_LINK_TIMEOUT)
>  		return NULL;
>  
> +	/* linked timeouts should have two refs once prep'ed */
> +	io_req_refcount(req);
> +	io_req_refcount(nxt);
> +	req_ref_get(nxt);
> +
>  	nxt->timeout.head = req;
>  	nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
>  	req->flags |= REQ_F_LINK_TIMEOUT;
> @@ -6478,8 +6496,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
>  			state->compl_reqs[state->compl_nr++] = req;
>  			if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
>  				io_submit_flush_completions(ctx);
> -		} else {
> -			io_put_req(req);
>  		}
>  	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
>  		switch (io_arm_poll_handler(req)) {
> @@ -6559,8 +6575,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>  	req->user_data = READ_ONCE(sqe->user_data);
>  	req->file = NULL;
>  	req->fixed_rsrc_refs = NULL;
> -	/* one is dropped after submission, the other at completion */
> -	atomic_set(&req->refs, 2);
>  	req->task = current;
>  
>  	/* enforce forwards compatibility on users */
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-08-10 13:31 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-08-10 12:05 [PATCH 0/5] skip request refcounting Pavel Begunkov
2021-08-10 12:05 ` [PATCH 1/5] io_uring: move req_ref_get() and friends Pavel Begunkov
2021-08-10 12:05 ` [PATCH 2/5] io_uring: delay freeing ->async_data Pavel Begunkov
2021-08-10 12:05 ` [PATCH 3/5] io_uring: protect rsrc dealloc by uring_lock Pavel Begunkov
2021-08-10 12:05 ` [PATCH 4/5] io_uring: remove req_ref_sub_and_test() Pavel Begunkov
2021-08-10 12:05 ` [PATCH 5/5] io_uring: request refcounting skipping Pavel Begunkov
2021-08-10 13:30   ` Pavel Begunkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox