public inbox for [email protected]
* [RFC][PATCH 0/3] allow to skip CQE posting
@ 2021-09-11 13:51 Pavel Begunkov
  2021-09-11 13:52 ` [PATCH 1/3] io_uring: clean cqe filling functions Pavel Begunkov
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-11 13:51 UTC (permalink / raw)
  To: Jens Axboe, io-uring

It's expensive enough to post a CQE, and there are other
reasons to want to skip them, e.g. for link handling, or it
may just be more convenient for the userspace.

Try to cover most of the use cases with one flag. The overhead
is one "if (req->flags & REQ_F_CQE_SKIP)" check per request and
a slightly bloated req_set_fail(), which should be bearable.

See 2/3 for the actual description of the flag.

Pavel Begunkov (3):
  io_uring: clean cqe filling functions
  io_uring: add option to skip CQE posting
  io_uring: don't spinlock when not posting CQEs

 fs/io_uring.c                 | 103 ++++++++++++++++++++++------------
 include/uapi/linux/io_uring.h |   3 +
 2 files changed, 70 insertions(+), 36 deletions(-)

-- 
2.33.0



* [PATCH 1/3] io_uring: clean cqe filling functions
  2021-09-11 13:51 [RFC][PATCH 0/3] allow to skip CQE posting Pavel Begunkov
@ 2021-09-11 13:52 ` Pavel Begunkov
  2021-09-12 18:24   ` Hao Xu
  2021-09-11 13:52 ` [PATCH 2/3] io_uring: add option to skip CQE posting Pavel Begunkov
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-11 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Split io_cqring_fill_event() into a few more targeted functions.
The first one is io_fill_cqe_aux(), for completions that are not
associated with a request; it also does the ->cq_extra accounting.
Examples are the extra CQEs from multishot poll and rsrc notifications.

The second is io_fill_cqe_req(), which should be called for a normal
request completion. There is nothing more to it at the moment; it will
be used in later patches.

The last one is the inlined __io_fill_cqe(), for finer-grained control;
it should be used with caution and only in the hottest paths.
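
At a glance, the new helpers end up being called like this (a condensed
sketch of the call points introduced by the diff below, for orientation
only):

    /* aux CQE not tied to a request; does the ->cq_extra accounting */
    io_fill_cqe_aux(ctx, user_data, res, cflags);

    /* normal request completion */
    io_fill_cqe_req(req, res, cflags);

    /* inlined core for the hottest paths, use with caution */
    __io_fill_cqe(req->ctx, req->user_data, res, cflags);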

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 58 ++++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e6ccdae189b0..1703130ae8df 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1078,8 +1078,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 bool cancel_all);
 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 
-static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-				 long res, unsigned int cflags);
+static void io_fill_cqe_req(struct io_kiocb *req, long res, unsigned int cflags);
+
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req);
 static void io_dismantle_req(struct io_kiocb *req);
@@ -1491,7 +1491,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
 		atomic_set(&req->ctx->cq_timeouts,
 			atomic_read(&req->ctx->cq_timeouts) + 1);
 		list_del_init(&req->timeout.list);
-		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
+		io_fill_cqe_req(req, status, 0);
 		io_put_req_deferred(req);
 	}
 }
@@ -1760,8 +1760,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	return true;
 }
 
-static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-					  long res, unsigned int cflags)
+static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
+				 long res, unsigned int cflags)
 {
 	struct io_uring_cqe *cqe;
 
@@ -1782,11 +1782,17 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
 	return io_cqring_event_overflow(ctx, user_data, res, cflags);
 }
 
-/* not as hot to bloat with inlining */
-static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
-					  long res, unsigned int cflags)
+static noinline void io_fill_cqe_req(struct io_kiocb *req, long res,
+				     unsigned int cflags)
+{
+	__io_fill_cqe(req->ctx, req->user_data, res, cflags);
+}
+
+static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
+				     long res, unsigned int cflags)
 {
-	return __io_cqring_fill_event(ctx, user_data, res, cflags);
+	ctx->cq_extra++;
+	return __io_fill_cqe(ctx, user_data, res, cflags);
 }
 
 static void io_req_complete_post(struct io_kiocb *req, long res,
@@ -1795,7 +1801,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
 	struct io_ring_ctx *ctx = req->ctx;
 
 	spin_lock(&ctx->completion_lock);
-	__io_cqring_fill_event(ctx, req->user_data, res, cflags);
+	__io_fill_cqe(ctx, req->user_data, res, cflags);
 	/*
 	 * If we're the last reference to this request, add to our locked
 	 * free_list cache.
@@ -2021,8 +2027,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
 		link->timeout.head = NULL;
 		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 			list_del(&link->timeout.list);
-			io_cqring_fill_event(link->ctx, link->user_data,
-					     -ECANCELED, 0);
+			io_fill_cqe_req(link, -ECANCELED, 0);
 			io_put_req_deferred(link);
 			return true;
 		}
@@ -2046,7 +2051,7 @@ static void io_fail_links(struct io_kiocb *req)
 		link->link = NULL;
 
 		trace_io_uring_fail_link(req, link);
-		io_cqring_fill_event(link->ctx, link->user_data, res, 0);
+		io_fill_cqe_req(link, res, 0);
 		io_put_req_deferred(link);
 		link = nxt;
 	}
@@ -2063,8 +2068,7 @@ static bool io_disarm_next(struct io_kiocb *req)
 		req->flags &= ~REQ_F_ARM_LTIMEOUT;
 		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_remove_next_linked(req);
-			io_cqring_fill_event(link->ctx, link->user_data,
-					     -ECANCELED, 0);
+			io_fill_cqe_req(link, -ECANCELED, 0);
 			io_put_req_deferred(link);
 			posted = true;
 		}
@@ -2335,8 +2339,8 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	for (i = 0; i < nr; i++) {
 		struct io_kiocb *req = state->compl_reqs[i];
 
-		__io_cqring_fill_event(ctx, req->user_data, req->result,
-					req->compl.cflags);
+		__io_fill_cqe(ctx, req->user_data, req->result,
+			      req->compl.cflags);
 	}
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
@@ -2454,8 +2458,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			continue;
 		}
 
-		__io_cqring_fill_event(ctx, req->user_data, req->result,
-					io_put_rw_kbuf(req));
+		__io_fill_cqe(ctx, req->user_data, req->result,
+			      io_put_rw_kbuf(req));
 		(*nr_events)++;
 
 		if (req_ref_put_and_test(req))
@@ -5293,13 +5297,12 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	}
 	if (req->poll.events & EPOLLONESHOT)
 		flags = 0;
-	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
+	if (!(flags & IORING_CQE_F_MORE)) {
+		io_fill_cqe_req(req, error, flags);
+	} else if (!io_fill_cqe_aux(ctx, req->user_data, error, flags)) {
 		req->poll.done = true;
 		flags = 0;
 	}
-	if (flags & IORING_CQE_F_MORE)
-		ctx->cq_extra++;
-
 	return !(flags & IORING_CQE_F_MORE);
 }
 
@@ -5627,9 +5630,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
 
 	if (do_complete) {
-		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
-		io_commit_cqring(req->ctx);
 		req_set_fail(req);
+		io_fill_cqe_req(req, -ECANCELED, 0);
+		io_commit_cqring(req->ctx);
 		io_put_req_deferred(req);
 	}
 	return do_complete;
@@ -5924,7 +5927,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 		return PTR_ERR(req);
 
 	req_set_fail(req);
-	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
+	io_fill_cqe_req(req, -ECANCELED, 0);
 	io_put_req_deferred(req);
 	return 0;
 }
@@ -8122,8 +8125,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 
 			io_ring_submit_lock(ctx, lock_ring);
 			spin_lock(&ctx->completion_lock);
-			io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
-			ctx->cq_extra++;
+			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
 			io_commit_cqring(ctx);
 			spin_unlock(&ctx->completion_lock);
 			io_cqring_ev_posted(ctx);
-- 
2.33.0



* [PATCH 2/3] io_uring: add option to skip CQE posting
  2021-09-11 13:51 [RFC][PATCH 0/3] allow to skip CQE posting Pavel Begunkov
  2021-09-11 13:52 ` [PATCH 1/3] io_uring: clean cqe filling functions Pavel Begunkov
@ 2021-09-11 13:52 ` Pavel Begunkov
  2021-09-11 13:52 ` [PATCH 3/3] io_uring: don't spinlock when not posting CQEs Pavel Begunkov
  2021-09-12 23:49 ` [RFC][PATCH 0/3] allow to skip CQE posting Pavel Begunkov
  3 siblings, 0 replies; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-11 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Emitting a CQE is expensive from the kernel perspective. Often, it's
also not convenient for the userspace, which spends some cycles on
processing it and ends up with more complicated logic. A similar problem
exists for linked requests, where we post a CQE for each request in the
link.

Introduce a new flag, IOSQE_CQE_SKIP_SUCCESS, to help with that. When
it is set and a request completes successfully, it won't generate a CQE.
When the request fails, it does produce a CQE, but all following linked
requests will be CQE-less, regardless of whether they have
IOSQE_CQE_SKIP_SUCCESS set or not. The notion of "fail" is the same as
for link failing/cancellation: it's opcode dependent, and _usually_
result >= 0 means success, but not always.

Linked timeouts are a bit special. When the request a timeout is linked
to was never attempted to execute, e.g. when failing linked requests, it
follows the description above. Otherwise, whether a linked timeout posts
a completion depends solely on IOSQE_CQE_SKIP_SUCCESS of that linked
timeout request. Linked timeouts never "fail" during execution, so for
them it's unconditional. Users are expected to not really care about the
result of a linked timeout but to rely solely on the result of the
master request. Another reason for such treatment is that it's racy: the
timeout callback may be running while the master request posts its
completion.

use case 1:
If one doesn't care about the results of some requests, e.g. normal
timeouts, just set IOSQE_CQE_SKIP_SUCCESS. An error result will still be
posted and needs to be handled.
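
For illustration, a rough userspace sketch of this case (it assumes
liburing helpers and the IOSQE_CQE_SKIP_SUCCESS definition this patch
adds to the uapi header; function and fd names are made up):

    #include <liburing.h>

    /* Queue a write whose result only matters on error: with
     * IOSQE_CQE_SKIP_SUCCESS, no CQE is posted unless the write fails. */
    static int queue_log_write(struct io_uring *ring, int fd,
                               const void *buf, unsigned len)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
            return -1;
        io_uring_prep_write(sqe, fd, buf, len, 0);
        io_uring_sqe_set_flags(sqe, IOSQE_CQE_SKIP_SUCCESS);
        return io_uring_submit(ring);
    }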

use case 2:
Set IOSQE_CQE_SKIP_SUCCESS for all requests of a link but the last, and
a completion will be posted only for the last one if everything goes
right; otherwise there will be only one CQE, for the first failed
request.
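
A rough sketch of this case, under the same assumptions as above
(liburing helpers, illustrative names, error handling elided):

    /* read->write link: only the last request posts a CQE on success;
     * if the read fails, only its error CQE is posted and the cancelled
     * write stays CQE-less. */
    static int queue_copy(struct io_uring *ring, int in_fd, int out_fd,
                          void *buf, unsigned len)
    {
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_read(sqe, in_fd, buf, len, 0);
        io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS);

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_write(sqe, out_fd, buf, len, 0);
        /* last in the link: the only CQE if everything succeeds */

        return io_uring_submit(ring);
    }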

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c                 | 41 +++++++++++++++++++++++++++--------
 include/uapi/linux/io_uring.h |  3 +++
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1703130ae8df..172c857e8b3f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -104,8 +104,9 @@
 #define IORING_MAX_REG_BUFFERS	(1U << 14)
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
-				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
-				IOSQE_BUFFER_SELECT)
+			IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+			IOSQE_BUFFER_SELECT | IOSQE_CQE_SKIP_SUCCESS)
+
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
 
@@ -722,6 +723,7 @@ enum {
 	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
 	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
+	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
 
 	/* first byte is taken by user flags, shift it to not overlap */
 	REQ_F_FAIL_BIT		= 8,
@@ -738,6 +740,7 @@ enum {
 	REQ_F_CREDS_BIT,
 	REQ_F_REFCOUNT_BIT,
 	REQ_F_ARM_LTIMEOUT_BIT,
+	REQ_F_SKIP_LINK_CQES_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_NOWAIT_READ_BIT,
 	REQ_F_NOWAIT_WRITE_BIT,
@@ -760,6 +763,8 @@ enum {
 	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
 	/* IOSQE_BUFFER_SELECT */
 	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
+	/* IOSQE_CQE_SKIP_SUCCESS */
+	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
 
 	/* fail rest of links */
 	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
@@ -795,6 +800,8 @@ enum {
 	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
 	/* there is a linked timeout that has to be armed */
 	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
+	/* don't post CQEs while failing linked requests */
+	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
 };
 
 struct async_poll {
@@ -1225,6 +1232,10 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
 static inline void req_set_fail(struct io_kiocb *req)
 {
 	req->flags |= REQ_F_FAIL;
+	if (req->flags & REQ_F_CQE_SKIP) {
+		req->flags &= ~REQ_F_CQE_SKIP;
+		req->flags |= REQ_F_SKIP_LINK_CQES;
+	}
 }
 
 static inline void req_fail_link_node(struct io_kiocb *req, int res)
@@ -1785,7 +1796,8 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 static noinline void io_fill_cqe_req(struct io_kiocb *req, long res,
 				     unsigned int cflags)
 {
-	__io_fill_cqe(req->ctx, req->user_data, res, cflags);
+	if (!(req->flags & REQ_F_CQE_SKIP))
+		__io_fill_cqe(req->ctx, req->user_data, res, cflags);
 }
 
 static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
@@ -1801,7 +1813,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
 	struct io_ring_ctx *ctx = req->ctx;
 
 	spin_lock(&ctx->completion_lock);
-	__io_fill_cqe(ctx, req->user_data, res, cflags);
+	if (!(req->flags & REQ_F_CQE_SKIP))
+		__io_fill_cqe(ctx, req->user_data, res, cflags);
 	/*
 	 * If we're the last reference to this request, add to our locked
 	 * free_list cache.
@@ -2027,6 +2040,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
 		link->timeout.head = NULL;
 		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 			list_del(&link->timeout.list);
+			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
 			io_fill_cqe_req(link, -ECANCELED, 0);
 			io_put_req_deferred(link);
 			return true;
@@ -2039,6 +2053,7 @@ static void io_fail_links(struct io_kiocb *req)
 	__must_hold(&req->ctx->completion_lock)
 {
 	struct io_kiocb *nxt, *link = req->link;
+	bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
 
 	req->link = NULL;
 	while (link) {
@@ -2051,7 +2066,10 @@ static void io_fail_links(struct io_kiocb *req)
 		link->link = NULL;
 
 		trace_io_uring_fail_link(req, link);
-		io_fill_cqe_req(link, res, 0);
+		if (!ignore_cqes) {
+			link->flags &= ~REQ_F_CQE_SKIP;
+			io_fill_cqe_req(link, res, 0);
+		}
 		io_put_req_deferred(link);
 		link = nxt;
 	}
@@ -2068,6 +2086,7 @@ static bool io_disarm_next(struct io_kiocb *req)
 		req->flags &= ~REQ_F_ARM_LTIMEOUT;
 		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_remove_next_linked(req);
+			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
 			io_fill_cqe_req(link, -ECANCELED, 0);
 			io_put_req_deferred(link);
 			posted = true;
@@ -2339,8 +2358,9 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	for (i = 0; i < nr; i++) {
 		struct io_kiocb *req = state->compl_reqs[i];
 
-		__io_fill_cqe(ctx, req->user_data, req->result,
-			      req->compl.cflags);
+		if (!(req->flags & REQ_F_CQE_SKIP))
+			__io_fill_cqe(ctx, req->user_data, req->result,
+				      req->compl.cflags);
 	}
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
@@ -2458,8 +2478,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			continue;
 		}
 
-		__io_fill_cqe(ctx, req->user_data, req->result,
-			      io_put_rw_kbuf(req));
+		if (!(req->flags & REQ_F_CQE_SKIP))
+			__io_fill_cqe(ctx, req->user_data, req->result,
+				      io_put_rw_kbuf(req));
 		(*nr_events)++;
 
 		if (req_ref_put_and_test(req))
@@ -5775,6 +5796,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	flags = READ_ONCE(sqe->len);
 	if (flags & ~IORING_POLL_ADD_MULTI)
 		return -EINVAL;
+	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
+		return -EINVAL;
 
 	io_req_set_refcount(req);
 	poll->events = io_poll_parse_events(sqe, flags);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 59ef35154e3d..e521d4661db0 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -70,6 +70,7 @@ enum {
 	IOSQE_IO_HARDLINK_BIT,
 	IOSQE_ASYNC_BIT,
 	IOSQE_BUFFER_SELECT_BIT,
+	IOSQE_CQE_SKIP_SUCCESS_BIT,
 };
 
 /*
@@ -87,6 +88,8 @@ enum {
 #define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
 /* select buffer from sqe->buf_group */
 #define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
+/* don't post CQE if request succeeded */
+#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
 
 /*
  * io_uring_setup() flags
-- 
2.33.0



* [PATCH 3/3] io_uring: don't spinlock when not posting CQEs
  2021-09-11 13:51 [RFC][PATCH 0/3] allow to skip CQE posting Pavel Begunkov
  2021-09-11 13:52 ` [PATCH 1/3] io_uring: clean cqe filling functions Pavel Begunkov
  2021-09-11 13:52 ` [PATCH 2/3] io_uring: add option to skip CQE posting Pavel Begunkov
@ 2021-09-11 13:52 ` Pavel Begunkov
  2021-09-11 20:12   ` Hao Xu
  2021-09-12 23:49 ` [RFC][PATCH 0/3] allow to skip CQE posting Pavel Begunkov
  3 siblings, 1 reply; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-11 13:52 UTC (permalink / raw)
  To: Jens Axboe, io-uring

When none of the requests queued for batch completion need to post a
CQE (see IOSQE_CQE_SKIP_SUCCESS), avoid grabbing ->completion_lock and
the rest of the commit/post work.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 172c857e8b3f..8983a5a6851a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -317,6 +317,7 @@ struct io_submit_state {
 
 	bool			plug_started;
 	bool			need_plug;
+	bool			flush_cqes;
 
 	/*
 	 * Batch completion logic
@@ -1858,6 +1859,8 @@ static void io_req_complete_state(struct io_kiocb *req, long res,
 	req->result = res;
 	req->compl.cflags = cflags;
 	req->flags |= REQ_F_COMPLETE_INLINE;
+	if (!(req->flags & IOSQE_CQE_SKIP_SUCCESS))
+		req->ctx->submit_state.flush_cqes = true;
 }
 
 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
@@ -2354,17 +2357,19 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	int i, nr = state->compl_nr;
 	struct req_batch rb;
 
-	spin_lock(&ctx->completion_lock);
-	for (i = 0; i < nr; i++) {
-		struct io_kiocb *req = state->compl_reqs[i];
+	if (state->flush_cqes) {
+		spin_lock(&ctx->completion_lock);
+		for (i = 0; i < nr; i++) {
+			struct io_kiocb *req = state->compl_reqs[i];
 
-		if (!(req->flags & REQ_F_CQE_SKIP))
-			__io_fill_cqe(ctx, req->user_data, req->result,
-				      req->compl.cflags);
+			if (!(req->flags & REQ_F_CQE_SKIP))
+				__io_fill_cqe(ctx, req->user_data, req->result,
+					      req->compl.cflags);
+		}
+		io_commit_cqring(ctx);
+		spin_unlock(&ctx->completion_lock);
+		io_cqring_ev_posted(ctx);
 	}
-	io_commit_cqring(ctx);
-	spin_unlock(&ctx->completion_lock);
-	io_cqring_ev_posted(ctx);
 
 	io_init_req_batch(&rb);
 	for (i = 0; i < nr; i++) {
@@ -2376,6 +2381,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 
 	io_req_free_batch_finish(ctx, &rb);
 	state->compl_nr = 0;
+	state->flush_cqes = false;
 }
 
 /*
-- 
2.33.0



* Re: [PATCH 3/3] io_uring: don't spinlock when not posting CQEs
  2021-09-11 13:52 ` [PATCH 3/3] io_uring: don't spinlock when not posting CQEs Pavel Begunkov
@ 2021-09-11 20:12   ` Hao Xu
  2021-09-11 21:10     ` Pavel Begunkov
  0 siblings, 1 reply; 11+ messages in thread
From: Hao Xu @ 2021-09-11 20:12 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring

On 9/11/21 9:52 PM, Pavel Begunkov wrote:
> When none of the requests queued for batch completion need to post a
> CQE (see IOSQE_CQE_SKIP_SUCCESS), avoid grabbing ->completion_lock and
> the rest of the commit/post work.
> 
[...]
> @@ -1858,6 +1859,8 @@ static void io_req_complete_state(struct io_kiocb *req, long res,
>   	req->result = res;
>   	req->compl.cflags = cflags;
>   	req->flags |= REQ_F_COMPLETE_INLINE;
> +	if (!(req->flags & IOSQE_CQE_SKIP_SUCCESS))
Haven't looked into this patchset deeply, but this looks
like it should be REQ_F_CQE_SKIP?
> +		req->ctx->submit_state.flush_cqes = true;
>   }
[...]



* Re: [PATCH 3/3] io_uring: don't spinlock when not posting CQEs
  2021-09-11 20:12   ` Hao Xu
@ 2021-09-11 21:10     ` Pavel Begunkov
  2021-09-12 18:14       ` Hao Xu
  0 siblings, 1 reply; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-11 21:10 UTC (permalink / raw)
  To: Hao Xu, Jens Axboe, io-uring

On 9/11/21 9:12 PM, Hao Xu wrote:
> On 9/11/21 9:52 PM, Pavel Begunkov wrote:
>> When none of the requests queued for batch completion need to post a
>> CQE (see IOSQE_CQE_SKIP_SUCCESS), avoid grabbing ->completion_lock and
>> the rest of the commit/post work.

It does what it says -- skips CQE posting on success. On failure it'd
still generate a completion. I was thinking about IOSQE_SKIP_CQE, but
I think it may be confusing.

Any other options to make it more clear?

-- 
Pavel Begunkov


* Re: [PATCH 3/3] io_uring: don't spinlock when not posting CQEs
  2021-09-11 21:10     ` Pavel Begunkov
@ 2021-09-12 18:14       ` Hao Xu
  2021-09-12 22:16         ` Pavel Begunkov
  0 siblings, 1 reply; 11+ messages in thread
From: Hao Xu @ 2021-09-12 18:14 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring

On 9/12/21 5:10 AM, Pavel Begunkov wrote:
> On 9/11/21 9:12 PM, Hao Xu wrote:
>> On 9/11/21 9:52 PM, Pavel Begunkov wrote:
>>> When none of the requests queued for batch completion need to post a
>>> CQE (see IOSQE_CQE_SKIP_SUCCESS), avoid grabbing ->completion_lock and
>>> the rest of the commit/post work.
> 
> It does what it says -- skips CQE posting on success. On failure it'd
> still generate a completion. I was thinking about IOSQE_SKIP_CQE, but
> I think it may be confusing.
I think IOSQE_CQE_SKIP_SUCCESS is clear... but we should do
req->flags & REQ_F_CQE_SKIP rather than req->flags & IOSQE_CQE_SKIP_SUCCESS.
> 
> Any other options to make it more clear?
> 



* Re: [PATCH 1/3] io_uring: clean cqe filling functions
  2021-09-11 13:52 ` [PATCH 1/3] io_uring: clean cqe filling functions Pavel Begunkov
@ 2021-09-12 18:24   ` Hao Xu
  2021-09-12 23:20     ` Pavel Begunkov
  0 siblings, 1 reply; 11+ messages in thread
From: Hao Xu @ 2021-09-12 18:24 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring; +Cc: Joseph Qi

On 9/11/21 9:52 PM, Pavel Begunkov wrote:
> Split io_cqring_fill_event() into a few more targeted functions.
> The first one is io_fill_cqe_aux(), for completions that are not
> associated with a request; it also does the ->cq_extra accounting.
> Examples are the extra CQEs from multishot poll and rsrc notifications.
> 
> The second is io_fill_cqe_req(), which should be called for a normal
> request completion. There is nothing more to it at the moment; it will
> be used in later patches.
> 
> The last one is the inlined __io_fill_cqe(), for finer-grained control;
> it should be used with caution and only in the hottest paths.
> 
[...]
> @@ -5293,13 +5297,12 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
>   	}
>   	if (req->poll.events & EPOLLONESHOT)
>   		flags = 0;
> -	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
> +	if (!(flags & IORING_CQE_F_MORE)) {
> +		io_fill_cqe_req(req, error, flags);
We should check the return value of io_fill_cqe_req() and do
req->poll.done = true if the return value is false, which would mean the
ocqe allocation failed. Though I think the current poll.done logic itself
is not right. (I've changed it in another patch.)
> +	} else if (!io_fill_cqe_aux(ctx, req->user_data, error, flags)) {
>   		req->poll.done = true;
>   		flags = 0;
>   	}
> -	if (flags & IORING_CQE_F_MORE)
> -		ctx->cq_extra++;
> -
>   	return !(flags & IORING_CQE_F_MORE);
>   }
[...]



* Re: [PATCH 3/3] io_uring: don't spinlock when not posting CQEs
  2021-09-12 18:14       ` Hao Xu
@ 2021-09-12 22:16         ` Pavel Begunkov
  0 siblings, 0 replies; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-12 22:16 UTC (permalink / raw)
  To: Hao Xu, Jens Axboe, io-uring

On 9/12/21 7:14 PM, Hao Xu wrote:
> On 9/12/21 5:10 AM, Pavel Begunkov wrote:
>> On 9/11/21 9:12 PM, Hao Xu wrote:
>>> On 9/11/21 9:52 PM, Pavel Begunkov wrote:
>>>> When none of the requests queued for batch completion need to post a
>>>> CQE (see IOSQE_CQE_SKIP_SUCCESS), avoid grabbing ->completion_lock and
>>>> the rest of the commit/post work.
>>
>> It does what it says -- skips CQE posting on success. On failure it'd
>> still generate a completion. I was thinking about IOSQE_SKIP_CQE, but
>> I think it may be confusing.
> I think IOSQE_CQE_SKIP_SUCCESS is clear... but we should do
> req->flags & REQ_F_CQE_SKIP rather than req->flags & IOSQE_CQE_SKIP_SUCCESS.

Surely we should, thanks for looking. I also think I need to add a few
lines on why IOSQE_CQE_SKIP_SUCCESS turns into just REQ_F_CQE_SKIP.

-- 
Pavel Begunkov


* Re: [PATCH 1/3] io_uring: clean cqe filling functions
  2021-09-12 18:24   ` Hao Xu
@ 2021-09-12 23:20     ` Pavel Begunkov
  0 siblings, 0 replies; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-12 23:20 UTC (permalink / raw)
  To: Hao Xu, Jens Axboe, io-uring; +Cc: Joseph Qi

On 9/12/21 7:24 PM, Hao Xu wrote:
> On 9/11/21 9:52 PM, Pavel Begunkov wrote:
>> Split io_cqring_fill_event() into a few more targeted functions.
>> The first one is io_fill_cqe_aux(), for completions that are not
>> associated with a request; it also does the ->cq_extra accounting.
>> Examples are the extra CQEs from multishot poll and rsrc notifications.
>>
>> The second is io_fill_cqe_req(), which should be called for a normal
>> request completion. There is nothing more to it at the moment; it will
>> be used in later patches.
>>
>> The last one is the inlined __io_fill_cqe(), for finer-grained control;
>> it should be used with caution and only in the hottest paths.
>>
>> Signed-off-by: Pavel Begunkov <[email protected]>
>> ---
[...]
>> @@ -5293,13 +5297,12 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
>>       }
>>       if (req->poll.events & EPOLLONESHOT)
>>           flags = 0;
>> -    if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
>> +    if (!(flags & IORING_CQE_F_MORE)) {
>> +        io_fill_cqe_req(req, error, flags);
> We should check the return value of io_fill_cqe_req() and do
> req->poll.done = true if the return value is false, which would mean the
> ocqe allocation failed. Though I think the current poll.done logic itself
> is not right. (I've changed it in another patch.)

.done was serving a purpose for cancellations and some other places of
convenience. There is not much difference for the machinery whether we
set it here or not, because the success case doesn't do it, and in both
cases the requests will be put in the end. If there is a bug, it would be
just as triggerable by successfully emitting a CQE.

I saw the poll fixes; I'll read through them later, unless Jens beats me
to it, and will keep them in mind when rebasing the series.


>> +    } else if (!io_fill_cqe_aux(ctx, req->user_data, error, flags)) {
>>           req->poll.done = true;
>>           flags = 0;
>>       }
>> -    if (flags & IORING_CQE_F_MORE)
>> -        ctx->cq_extra++;
>> -
>>       return !(flags & IORING_CQE_F_MORE);
>>   }

-- 
Pavel Begunkov


* Re: [RFC][PATCH 0/3] allow to skip CQE posting
  2021-09-11 13:51 [RFC][PATCH 0/3] allow to skip CQE posting Pavel Begunkov
                   ` (2 preceding siblings ...)
  2021-09-11 13:52 ` [PATCH 3/3] io_uring: don't spinlock when not posting CQEs Pavel Begunkov
@ 2021-09-12 23:49 ` Pavel Begunkov
  3 siblings, 0 replies; 11+ messages in thread
From: Pavel Begunkov @ 2021-09-12 23:49 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 9/11/21 2:51 PM, Pavel Begunkov wrote:
> It's expensive enough to post a CQE, and there are other
> reasons to want to skip them, e.g. for link handling, or it
> may just be more convenient for the userspace.

FWIW, tried it with a benchmark doing QD1 buffered reads, with the reads
taking 60+% of the time (memcpy/etc.) and 20-25% spent in io_uring. Got
+6-7% performance from the kernel side alone, with additional room to
make the userspace side faster.

> Try to cover most of the use cases with one flag. The overhead
> is one "if (req->flags & REQ_F_CQE_SKIP)" check per request and
> a slightly bloated req_set_fail(), which should be bearable.
> 
> See 2/3 for the actual description of the flag.
> 
> Pavel Begunkov (3):
>   io_uring: clean cqe filling functions
>   io_uring: add option to skip CQE posting
>   io_uring: don't spinlock when not posting CQEs
> 
>  fs/io_uring.c                 | 103 ++++++++++++++++++++++------------
>  include/uapi/linux/io_uring.h |   3 +
>  2 files changed, 70 insertions(+), 36 deletions(-)
> 

-- 
Pavel Begunkov


end of thread

Thread overview: 11+ messages
2021-09-11 13:51 [RFC][PATCH 0/3] allow to skip CQE posting Pavel Begunkov
2021-09-11 13:52 ` [PATCH 1/3] io_uring: clean cqe filling functions Pavel Begunkov
2021-09-12 18:24   ` Hao Xu
2021-09-12 23:20     ` Pavel Begunkov
2021-09-11 13:52 ` [PATCH 2/3] io_uring: add option to skip CQE posting Pavel Begunkov
2021-09-11 13:52 ` [PATCH 3/3] io_uring: don't spinlock when not posting CQEs Pavel Begunkov
2021-09-11 20:12   ` Hao Xu
2021-09-11 21:10     ` Pavel Begunkov
2021-09-12 18:14       ` Hao Xu
2021-09-12 22:16         ` Pavel Begunkov
2021-09-12 23:49 ` [RFC][PATCH 0/3] allow to skip CQE posting Pavel Begunkov
