public inbox for [email protected]
* [PATCHSET next 0/10] Various network improvements
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring

Hi,

This series was mostly driven by a report on how the unfair handling
of task_work can negatively impact performance as a whole.

- Improve how we recycle/put ring-provided kbufs. Not related to
  the issue at hand, but it does improve that side.

- Improve how we deal with non-local task_work. Avoid unbounded
  looping, and handle it in issue order, like we do for local
  task_work.

- Decouple SQPOLL task_work from actual task_work.

-- 
Jens Axboe



* [PATCH 01/10] io_uring/kbuf: cleanup passing back cflags
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

We have various functions calculating the CQE cflags we need to pass
back, but the logic is the same everywhere. Make a number of the putting
functions void, and just have the two main helpers for this,
io_put_kbuf() and io_put_kbuf_comp(), calculate the actual mask and pass
it back.

While at it, clean up how we put REQ_F_BUFFER_RING buffers. Before
this change, we would call into __io_put_kbuf() only to go right back
into the header-defined functions. As clearing this type of buffer is
just re-assigning the buf_index and incrementing the head, this is
very wasteful.
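
For reference, what this consolidated cflags value means to userspace: when
a provided buffer was consumed, the CQE carries IORING_CQE_F_BUFFER in
cqe->flags and the buffer ID sits in the bits above IORING_CQE_BUFFER_SHIFT.
A minimal decode sketch against the uapi header follows; the cqe_buffer_id()
helper and the synthetic CQE in main() are purely illustrative.

#include <stdio.h>
#include <stdbool.h>
#include <linux/io_uring.h>

static bool cqe_buffer_id(const struct io_uring_cqe *cqe, unsigned int *bid)
{
	if (!(cqe->flags & IORING_CQE_F_BUFFER))
		return false;	/* no provided buffer attached to this CQE */
	*bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	return true;
}

int main(void)
{
	/* synthetic CQE: 512 bytes received into buffer ID 3 */
	struct io_uring_cqe cqe = {
		.res = 512,
		.flags = IORING_CQE_F_BUFFER | (3 << IORING_CQE_BUFFER_SHIFT),
	};
	unsigned int bid;

	if (cqe_buffer_id(&cqe, &bid))
		printf("got %d bytes in provided buffer %u\n", cqe.res, bid);
	return 0;
}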

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/kbuf.c | 14 ++++----------
 io_uring/kbuf.h | 41 +++++++++++++++++++++++++++--------------
 2 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 71880615bb78..ee866d646997 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -102,10 +102,8 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
 	return true;
 }
 
-unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
+void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
 {
-	unsigned int cflags;
-
 	/*
 	 * We can add this buffer back to two lists:
 	 *
@@ -118,21 +116,17 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
 	 * We migrate buffers from the comp_list to the issue cache list
 	 * when we need one.
 	 */
-	if (req->flags & REQ_F_BUFFER_RING) {
-		/* no buffers to recycle for this case */
-		cflags = __io_put_kbuf_list(req, NULL);
-	} else if (issue_flags & IO_URING_F_UNLOCKED) {
+	if (issue_flags & IO_URING_F_UNLOCKED) {
 		struct io_ring_ctx *ctx = req->ctx;
 
 		spin_lock(&ctx->completion_lock);
-		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
+		__io_put_kbuf_list(req, &ctx->io_buffers_comp);
 		spin_unlock(&ctx->completion_lock);
 	} else {
 		lockdep_assert_held(&req->ctx->uring_lock);
 
-		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
+		__io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
 	}
-	return cflags;
 }
 
 static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 53dfaa71a397..f74c910b83f4 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -57,7 +57,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
 
 void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
 
-unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
+void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
@@ -108,41 +108,54 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
 	return false;
 }
 
-static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req,
-					      struct list_head *list)
+static inline void __io_put_kbuf_ring(struct io_kiocb *req)
 {
-	unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
+	if (req->buf_list) {
+		req->buf_index = req->buf_list->bgid;
+		req->buf_list->head++;
+	}
+	req->flags &= ~REQ_F_BUFFER_RING;
+}
 
+static inline void __io_put_kbuf_list(struct io_kiocb *req,
+				      struct list_head *list)
+{
 	if (req->flags & REQ_F_BUFFER_RING) {
-		if (req->buf_list) {
-			req->buf_index = req->buf_list->bgid;
-			req->buf_list->head++;
-		}
-		req->flags &= ~REQ_F_BUFFER_RING;
+		__io_put_kbuf_ring(req);
 	} else {
 		req->buf_index = req->kbuf->bgid;
 		list_add(&req->kbuf->list, list);
 		req->flags &= ~REQ_F_BUFFER_SELECTED;
 	}
-
-	return ret;
 }
 
 static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
 {
+	unsigned int ret;
+
 	lockdep_assert_held(&req->ctx->completion_lock);
 
 	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
 		return 0;
-	return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
+
+	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
+	__io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
+	return ret;
 }
 
 static inline unsigned int io_put_kbuf(struct io_kiocb *req,
 				       unsigned issue_flags)
 {
+	unsigned int ret;
 
-	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
+	if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED)))
 		return 0;
-	return __io_put_kbuf(req, issue_flags);
+
+	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
+	if (req->flags & REQ_F_BUFFER_RING)
+		__io_put_kbuf_ring(req);
+	else
+		__io_put_kbuf(req, issue_flags);
+	return ret;
 }
 #endif
-- 
2.43.0



* [PATCH 02/10] io_uring: remove looping around handling traditional task_work
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

A previous commit added looping around handling traditional task_work
as an optimization, and while that may seem like a good idea, it's also
possible to run into application starvation doing so. If the task_work
generation is bursty, we can get very deep task_work queues, and we can
end up looping in here for a very long time.

One immediately observable problem with that is handling network traffic
using provided buffers, where a flood of incoming traffic combined with
looping task_work handling will very quickly lead to buffer starvation,
as we keep running task_work rather than returning to the application so
it can handle the associated CQEs and provide buffers back.
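
To make the userspace side concrete, here is a rough sketch of the
reap-and-replenish cycle the application can only run once it gets back from
the kernel. It assumes liburing (2.4 or newer for the buffer ring helpers);
the ring, the registered buffer ring 'br' backed by the 'bufs' array, and the
in-flight recv requests are set up elsewhere, and the function and parameter
names are placeholders.

#include <liburing.h>

static void reap_and_replenish(struct io_uring *ring,
			       struct io_uring_buf_ring *br, char *bufs,
			       unsigned int buf_size, unsigned int nr_bufs)
{
	struct io_uring_cqe *cqe;
	unsigned int head, seen = 0, readded = 0;

	/* wait for at least one completion to come back from the kernel */
	io_uring_submit_and_wait(ring, 1);

	io_uring_for_each_cqe(ring, head, cqe) {
		if (cqe->res > 0 && (cqe->flags & IORING_CQE_F_BUFFER)) {
			unsigned int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

			/* ... consume cqe->res bytes at bufs + bid * buf_size ... */

			/* hand the buffer right back so receives can continue */
			io_uring_buf_ring_add(br, bufs + bid * buf_size,
					      buf_size, bid,
					      io_uring_buf_ring_mask(nr_bufs),
					      readded++);
		}
		seen++;
	}
	/* publish the re-added buffers and retire the CQEs we processed */
	io_uring_buf_ring_advance(br, readded);
	io_uring_cq_advance(ring, seen);
}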

Fixes: 3a0c037b0e16 ("io_uring: batch task_work")
Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 45 +++++++--------------------------------------
 1 file changed, 7 insertions(+), 38 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9b499864f10d..ae5b38355864 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1175,12 +1175,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 
 static unsigned int handle_tw_list(struct llist_node *node,
 				   struct io_ring_ctx **ctx,
-				   struct io_tw_state *ts,
-				   struct llist_node *last)
+				   struct io_tw_state *ts)
 {
 	unsigned int count = 0;
 
-	while (node && node != last) {
+	do {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
@@ -1204,7 +1203,7 @@ static unsigned int handle_tw_list(struct llist_node *node,
 			*ctx = NULL;
 			cond_resched();
 		}
-	}
+	} while (node);
 
 	return count;
 }
@@ -1223,22 +1222,6 @@ static inline struct llist_node *io_llist_xchg(struct llist_head *head,
 	return xchg(&head->first, new);
 }
 
-/**
- * io_llist_cmpxchg - possibly swap all entries in a lock-less list
- * @head:	the head of lock-less list to delete all entries
- * @old:	expected old value of the first entry of the list
- * @new:	new entry as the head of the list
- *
- * perform a cmpxchg on the first entry of the list.
- */
-
-static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
-						  struct llist_node *old,
-						  struct llist_node *new)
-{
-	return cmpxchg(&head->first, old, new);
-}
-
 static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
 {
 	struct llist_node *node = llist_del_all(&tctx->task_list);
@@ -1273,9 +1256,7 @@ void tctx_task_work(struct callback_head *cb)
 	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
-	struct llist_node fake = {};
 	struct llist_node *node;
-	unsigned int loops = 0;
 	unsigned int count = 0;
 
 	if (unlikely(current->flags & PF_EXITING)) {
@@ -1283,21 +1264,9 @@ void tctx_task_work(struct callback_head *cb)
 		return;
 	}
 
-	do {
-		loops++;
-		node = io_llist_xchg(&tctx->task_list, &fake);
-		count += handle_tw_list(node, &ctx, &ts, &fake);
-
-		/* skip expensive cmpxchg if there are items in the list */
-		if (READ_ONCE(tctx->task_list.first) != &fake)
-			continue;
-		if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
-			io_submit_flush_completions(ctx);
-			if (READ_ONCE(tctx->task_list.first) != &fake)
-				continue;
-		}
-		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
-	} while (node != &fake);
+	node = llist_del_all(&tctx->task_list);
+	if (node)
+		count = handle_tw_list(node, &ctx, &ts);
 
 	ctx_flush_and_put(ctx, &ts);
 
@@ -1305,7 +1274,7 @@ void tctx_task_work(struct callback_head *cb)
 	if (unlikely(atomic_read(&tctx->in_cancel)))
 		io_uring_drop_tctx_refs(current);
 
-	trace_io_uring_task_work_run(tctx, count, loops);
+	trace_io_uring_task_work_run(tctx, count, 1);
 }
 
 static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
-- 
2.43.0



* [PATCH 03/10] io_uring: remove 'loops' argument from trace_io_uring_task_work_run()
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

We no longer loop in task_work handling, so delete the argument from
the tracepoint as it's always 1 and hence not very informative.

Signed-off-by: Jens Axboe <[email protected]>
---
 include/trace/events/io_uring.h | 10 +++-------
 io_uring/io_uring.c             |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 3d7704a52b73..6bb4aaba9e9c 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -602,29 +602,25 @@ TRACE_EVENT(io_uring_cqe_overflow,
  *
  * @tctx:		pointer to a io_uring_task
  * @count:		how many functions it ran
- * @loops:		how many loops it ran
  *
  */
 TRACE_EVENT(io_uring_task_work_run,
 
-	TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+	TP_PROTO(void *tctx, unsigned int count),
 
-	TP_ARGS(tctx, count, loops),
+	TP_ARGS(tctx, count),
 
 	TP_STRUCT__entry (
 		__field(  void *,		tctx		)
 		__field(  unsigned int,		count		)
-		__field(  unsigned int,		loops		)
 	),
 
 	TP_fast_assign(
 		__entry->tctx		= tctx;
 		__entry->count		= count;
-		__entry->loops		= loops;
 	),
 
-	TP_printk("tctx %p, count %u, loops %u",
-		 __entry->tctx, __entry->count, __entry->loops)
+	TP_printk("tctx %p, count %u", __entry->tctx, __entry->count)
 );
 
 TRACE_EVENT(io_uring_short_write,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ae5b38355864..ced15a13fcbb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1274,7 +1274,7 @@ void tctx_task_work(struct callback_head *cb)
 	if (unlikely(atomic_read(&tctx->in_cancel)))
 		io_uring_drop_tctx_refs(current);
 
-	trace_io_uring_task_work_run(tctx, count, 1);
+	trace_io_uring_task_work_run(tctx, count);
 }
 
 static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
-- 
2.43.0



* [PATCH 04/10] io_uring: handle traditional task_work in FIFO order
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

For local task_work, which is used if IORING_SETUP_DEFER_TASKRUN is set,
we reverse the order of the lockless list before processing the work.
This is done to process items in the order in which they were queued, as
the llist always adds to the head.

Do the same for traditional task_work, so we have the same behavior for
both types.
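
As a standalone illustration of why the reversal gives FIFO handling (plain
userspace C, not the kernel llist implementation; push() and reverse() just
mirror the idea behind llist_add() and llist_reverse_order()):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int seq;
	struct node *next;
};

/* add to the head, like llist_add() */
static void push(struct node **head, struct node *n)
{
	n->next = *head;
	*head = n;
}

/* in-place reversal, like llist_reverse_order() */
static struct node *reverse(struct node *n)
{
	struct node *prev = NULL;

	while (n) {
		struct node *next = n->next;

		n->next = prev;
		prev = n;
		n = next;
	}
	return prev;
}

int main(void)
{
	struct node *head = NULL, *n;
	int i;

	for (i = 0; i < 4; i++) {
		n = malloc(sizeof(*n));
		n->seq = i;
		push(&head, n);			/* queued in order 0, 1, 2, 3 */
	}

	/* a plain walk would see 3, 2, 1, 0; reversing restores queue order */
	for (n = reverse(head); n; ) {
		struct node *next = n->next;

		printf("handling item %d\n", n->seq);
		free(n);
		n = next;
	}
	return 0;
}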

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ced15a13fcbb..47d06bc55c95 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1266,7 +1266,7 @@ void tctx_task_work(struct callback_head *cb)
 
 	node = llist_del_all(&tctx->task_list);
 	if (node)
-		count = handle_tw_list(node, &ctx, &ts);
+		count = handle_tw_list(llist_reverse_order(node), &ctx, &ts);
 
 	ctx_flush_and_put(ctx, &ts);
 
-- 
2.43.0



* [PATCH 05/10] io_uring: remove next io_kiocb fetch in task_work running
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

We just reversed the task_work list, and that will have touched the
requests as well. Get rid of this prefetch optimization, as it should
not make a difference anymore.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 47d06bc55c95..a587b240fa48 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1184,8 +1184,6 @@ static unsigned int handle_tw_list(struct llist_node *node,
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 
-		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
 		if (req->ctx != *ctx) {
 			ctx_flush_and_put(*ctx, ts);
 			*ctx = req->ctx;
@@ -1408,7 +1406,6 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
-		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
 		INDIRECT_CALL_2(req->io_task_work.func,
 				io_poll_task_func, io_req_rw_complete,
 				req, ts);
-- 
2.43.0



* [PATCH 06/10] io_uring: remove unconditional looping in local task_work handling
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

If we have a ton of notifications coming in, we can be looping in here
for a long time. This can be problematic for various reasons, mostly
because we can starve userspace. If the application is waiting on N
events, then only re-run the local task_work handling if we still need
more events to satisfy that.
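
The N in question ultimately comes from the wait count the application hands
to io_uring_enter(). A rough liburing sketch of that knob follows; the
requests that would actually generate completions are omitted, so as written
the wait would simply block.

#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	int ret;

	/* local task_work needs DEFER_TASKRUN, which requires SINGLE_ISSUER */
	ret = io_uring_queue_init(64, &ring, IORING_SETUP_SINGLE_ISSUER |
					     IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}

	/* ... prepare and queue requests here ... */

	/*
	 * Ask for at least 8 completions. With this change, the kernel stops
	 * re-running local task_work once that request can be satisfied,
	 * rather than draining everything that happens to be queued.
	 */
	ret = io_uring_submit_and_wait(&ring, 8);

	io_uring_queue_exit(&ring);
	return ret < 0;
}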

Fixes: c0e0d6ba25f1 ("io_uring: add IORING_SETUP_DEFER_TASKRUN")
Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a587b240fa48..ddbce269b6a7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1386,7 +1386,20 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 	}
 }
 
-static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
+static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
+				       int min_events)
+{
+	if (llist_empty(&ctx->work_llist))
+		return false;
+	if (events < min_events)
+		return true;
+	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+	return false;
+}
+
+static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
+			       int min_events)
 {
 	struct llist_node *node;
 	unsigned int loops = 0;
@@ -1414,18 +1427,20 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 	}
 	loops++;
 
-	if (!llist_empty(&ctx->work_llist))
+	if (io_run_local_work_continue(ctx, ret, min_events))
 		goto again;
 	if (ts->locked) {
 		io_submit_flush_completions(ctx);
-		if (!llist_empty(&ctx->work_llist))
+		if (io_run_local_work_continue(ctx, ret, min_events))
 			goto again;
 	}
+
 	trace_io_uring_local_work_run(ctx, ret, loops);
 	return ret;
 }
 
-static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
+static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
+					   int min_events)
 {
 	struct io_tw_state ts = { .locked = true, };
 	int ret;
@@ -1433,20 +1448,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
 	if (llist_empty(&ctx->work_llist))
 		return 0;
 
-	ret = __io_run_local_work(ctx, &ts);
+	ret = __io_run_local_work(ctx, &ts, min_events);
 	/* shouldn't happen! */
 	if (WARN_ON_ONCE(!ts.locked))
 		mutex_lock(&ctx->uring_lock);
 	return ret;
 }
 
-static int io_run_local_work(struct io_ring_ctx *ctx)
+static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
 {
 	struct io_tw_state ts = {};
 	int ret;
 
 	ts.locked = mutex_trylock(&ctx->uring_lock);
-	ret = __io_run_local_work(ctx, &ts);
+	ret = __io_run_local_work(ctx, &ts, min_events);
 	if (ts.locked)
 		mutex_unlock(&ctx->uring_lock);
 
@@ -1642,7 +1657,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 		    io_task_work_pending(ctx)) {
 			u32 tail = ctx->cached_cq_tail;
 
-			(void) io_run_local_work_locked(ctx);
+			(void) io_run_local_work_locked(ctx, min);
 
 			if (task_work_pending(current) ||
 			    wq_list_empty(&ctx->iopoll_list)) {
@@ -2486,7 +2501,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
 {
 	if (!llist_empty(&ctx->work_llist)) {
 		__set_current_state(TASK_RUNNING);
-		if (io_run_local_work(ctx) > 0)
+		if (io_run_local_work(ctx, INT_MAX) > 0)
 			return 0;
 	}
 	if (io_run_task_work() > 0)
@@ -2554,7 +2569,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	if (!io_allowed_run_tw(ctx))
 		return -EEXIST;
 	if (!llist_empty(&ctx->work_llist))
-		io_run_local_work(ctx);
+		io_run_local_work(ctx, min_events);
 	io_run_task_work();
 	io_cqring_overflow_flush(ctx);
 	/* if user messes with these they will just get an early return */
@@ -2592,11 +2607,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
+		int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
 		unsigned long check_cq;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-			int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
-
 			atomic_set(&ctx->cq_wait_nr, nr_wait);
 			set_current_state(TASK_INTERRUPTIBLE);
 		} else {
@@ -2615,7 +2629,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		 */
 		io_run_task_work();
 		if (!llist_empty(&ctx->work_llist))
-			io_run_local_work(ctx);
+			io_run_local_work(ctx, nr_wait);
 
 		/*
 		 * Non-local task_work will be run on exit to userspace, but
@@ -3270,7 +3284,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    io_allowed_defer_tw_run(ctx))
-		ret |= io_run_local_work(ctx) > 0;
+		ret |= io_run_local_work(ctx, INT_MAX) > 0;
 	ret |= io_cancel_defer_files(ctx, task, cancel_all);
 	mutex_lock(&ctx->uring_lock);
 	ret |= io_poll_remove_all(ctx, task, cancel_all);
@@ -3632,7 +3646,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			 * it should handle ownership problems if any.
 			 */
 			if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
-				(void)io_run_local_work_locked(ctx);
+				(void)io_run_local_work_locked(ctx, min_complete);
 		}
 		mutex_unlock(&ctx->uring_lock);
 	}
-- 
2.43.0



* [PATCH 07/10] io_uring/poll: improve readability of poll reference decrementing
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

This overly long line is hard to read. Break it up by AND'ing v with the
ref mask first, then performing the atomic_sub_return() with the masked
value.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/poll.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 3f3380dc5f68..8e01c2672df4 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -343,8 +343,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
 		 * Release all references, retry if someone tried to restart
 		 * task_work while we were executing it.
 		 */
-	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
-					IO_POLL_REF_MASK);
+		v &= IO_POLL_REF_MASK;
+	} while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK);
 
 	return IOU_POLL_NO_ACTION;
 }
-- 
2.43.0



* [PATCH 08/10] io_uring: cleanup handle_tw_list() calling convention
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Now that we don't loop around task_work anymore, there's no point in
maintaining the ring and locked state outside of handle_tw_list(). Get
rid of passing in those pointers (and pointers to pointers) and just do
the management internally in handle_tw_list().

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ddbce269b6a7..df02ed6677c5 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1173,10 +1173,10 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 	percpu_ref_put(&ctx->refs);
 }
 
-static unsigned int handle_tw_list(struct llist_node *node,
-				   struct io_ring_ctx **ctx,
-				   struct io_tw_state *ts)
+static unsigned int handle_tw_list(struct llist_node *node)
 {
+	struct io_ring_ctx *ctx = NULL;
+	struct io_tw_state ts = { };
 	unsigned int count = 0;
 
 	do {
@@ -1184,25 +1184,26 @@ static unsigned int handle_tw_list(struct llist_node *node,
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 
-		if (req->ctx != *ctx) {
-			ctx_flush_and_put(*ctx, ts);
-			*ctx = req->ctx;
+		if (req->ctx != ctx) {
+			ctx_flush_and_put(ctx, &ts);
+			ctx = req->ctx;
 			/* if not contended, grab and improve batching */
-			ts->locked = mutex_trylock(&(*ctx)->uring_lock);
-			percpu_ref_get(&(*ctx)->refs);
+			ts.locked = mutex_trylock(&ctx->uring_lock);
+			percpu_ref_get(&ctx->refs);
 		}
 		INDIRECT_CALL_2(req->io_task_work.func,
 				io_poll_task_func, io_req_rw_complete,
-				req, ts);
+				req, &ts);
 		node = next;
 		count++;
 		if (unlikely(need_resched())) {
-			ctx_flush_and_put(*ctx, ts);
-			*ctx = NULL;
+			ctx_flush_and_put(ctx, &ts);
+			ctx = NULL;
 			cond_resched();
 		}
 	} while (node);
 
+	ctx_flush_and_put(ctx, &ts);
 	return count;
 }
 
@@ -1250,8 +1251,6 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
 
 void tctx_task_work(struct callback_head *cb)
 {
-	struct io_tw_state ts = {};
-	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
 	struct llist_node *node;
@@ -1264,9 +1263,7 @@ void tctx_task_work(struct callback_head *cb)
 
 	node = llist_del_all(&tctx->task_list);
 	if (node)
-		count = handle_tw_list(llist_reverse_order(node), &ctx, &ts);
-
-	ctx_flush_and_put(ctx, &ts);
+		count = handle_tw_list(llist_reverse_order(node));
 
 	/* relaxed read is enough as only the task itself sets ->in_cancel */
 	if (unlikely(atomic_read(&tctx->in_cancel)))
-- 
2.43.0



* [PATCH 09/10] io_uring: pass in counter to handle_tw_list() rather than return it
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

No functional changes in this patch, just in preparation for returning
something other than count from this helper.

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index df02ed6677c5..20421bf36cc1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1173,11 +1173,10 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 	percpu_ref_put(&ctx->refs);
 }
 
-static unsigned int handle_tw_list(struct llist_node *node)
+static void handle_tw_list(struct llist_node *node, unsigned int *count)
 {
 	struct io_ring_ctx *ctx = NULL;
 	struct io_tw_state ts = { };
-	unsigned int count = 0;
 
 	do {
 		struct llist_node *next = node->next;
@@ -1195,7 +1194,7 @@ static unsigned int handle_tw_list(struct llist_node *node)
 				io_poll_task_func, io_req_rw_complete,
 				req, &ts);
 		node = next;
-		count++;
+		(*count)++;
 		if (unlikely(need_resched())) {
 			ctx_flush_and_put(ctx, &ts);
 			ctx = NULL;
@@ -1204,7 +1203,6 @@ static unsigned int handle_tw_list(struct llist_node *node)
 	} while (node);
 
 	ctx_flush_and_put(ctx, &ts);
-	return count;
 }
 
 /**
@@ -1263,7 +1261,7 @@ void tctx_task_work(struct callback_head *cb)
 
 	node = llist_del_all(&tctx->task_list);
 	if (node)
-		count = handle_tw_list(llist_reverse_order(node));
+		handle_tw_list(llist_reverse_order(node), &count);
 
 	/* relaxed read is enough as only the task itself sets ->in_cancel */
 	if (unlikely(atomic_read(&tctx->in_cancel)))
-- 
2.43.0



* [PATCH 10/10] io_uring/sqpoll: manage task_work privately
From: Jens Axboe @ 2024-02-06 16:24 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

Decouple SQPOLL from generic task_work running, and cap the number of
entries we process at a time. If we exceed that number, push the
remaining entries to a retry list that we'll process first next time.

We cap the number of entries to process at 8, which is a fairly
arbitrary choice. We just want enough per-ctx batching here, while not
processing endlessly.

Since we manually run PF_IO_WORKER-related task_work anyway, as the
task never exits to userspace, with this in place we no longer need to
add an actual task_work item to the per-process list.
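
As a standalone sketch of the capped-drain pattern (plain userspace C, not
the kernel implementation; the cap of 3 in main() just stands in for
IORING_TW_CAP_ENTRIES_VALUE):

#include <stdio.h>

struct work {
	int id;
	struct work *next;
};

/* run at most 'max' entries, return the unprocessed tail (or NULL) */
static struct work *run_capped(struct work *node, unsigned int max,
			       unsigned int *count)
{
	while (node && *count < max) {
		struct work *next = node->next;

		printf("running work item %d\n", node->id);
		(*count)++;
		node = next;
	}
	return node;
}

int main(void)
{
	struct work items[5] = {
		{ 0, &items[1] }, { 1, &items[2] }, { 2, &items[3] },
		{ 3, &items[4] }, { 4, NULL },
	};
	struct work *retry_list = &items[0];
	unsigned int count;

	/* each pass drains at most 3 items, the tail becomes the retry list */
	while (retry_list) {
		count = 0;
		retry_list = run_capped(retry_list, 3, &count);
		printf("pass ran %u items\n", count);
	}
	return 0;
}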

Signed-off-by: Jens Axboe <[email protected]>
---
 io_uring/io_uring.c | 46 +++++++++++++++++++++++++++++++++++----------
 io_uring/io_uring.h | 24 +++++++++++++++++------
 io_uring/sqpoll.c   | 29 +++++++++++++++++++++++++++-
 3 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 20421bf36cc1..9ba2244c624e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1173,7 +1173,14 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 	percpu_ref_put(&ctx->refs);
 }
 
-static void handle_tw_list(struct llist_node *node, unsigned int *count)
+/*
+ * Run queued task_work, returning the number of entries processed in *count.
+ * If more entries than max_entries are available, stop processing once this
+ * is reached and return the rest of the list.
+ */
+struct llist_node *io_handle_tw_list(struct llist_node *node,
+				     unsigned int *count,
+				     unsigned int max_entries)
 {
 	struct io_ring_ctx *ctx = NULL;
 	struct io_tw_state ts = { };
@@ -1200,9 +1207,10 @@ static void handle_tw_list(struct llist_node *node, unsigned int *count)
 			ctx = NULL;
 			cond_resched();
 		}
-	} while (node);
+	} while (node && *count < max_entries);
 
 	ctx_flush_and_put(ctx, &ts);
+	return node;
 }
 
 /**
@@ -1247,27 +1255,41 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
 	}
 }
 
-void tctx_task_work(struct callback_head *cb)
+struct llist_node *tctx_task_work_run(struct io_uring_task *tctx,
+				      unsigned int max_entries,
+				      unsigned int *count)
 {
-	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
-						  task_work);
 	struct llist_node *node;
-	unsigned int count = 0;
 
 	if (unlikely(current->flags & PF_EXITING)) {
 		io_fallback_tw(tctx, true);
-		return;
+		return NULL;
 	}
 
 	node = llist_del_all(&tctx->task_list);
-	if (node)
-		handle_tw_list(llist_reverse_order(node), &count);
+	if (node) {
+		node = llist_reverse_order(node);
+		node = io_handle_tw_list(node, count, max_entries);
+	}
 
 	/* relaxed read is enough as only the task itself sets ->in_cancel */
 	if (unlikely(atomic_read(&tctx->in_cancel)))
 		io_uring_drop_tctx_refs(current);
 
-	trace_io_uring_task_work_run(tctx, count);
+	trace_io_uring_task_work_run(tctx, *count);
+	return node;
+}
+
+void tctx_task_work(struct callback_head *cb)
+{
+	struct io_uring_task *tctx;
+	struct llist_node *ret;
+	unsigned int count = 0;
+
+	tctx = container_of(cb, struct io_uring_task, task_work);
+	ret = tctx_task_work_run(tctx, UINT_MAX, &count);
+	/* can't happen */
+	WARN_ON_ONCE(ret);
 }
 
 static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
@@ -1350,6 +1372,10 @@ static void io_req_normal_work_add(struct io_kiocb *req)
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
 
+	/* SQPOLL doesn't need the task_work added, it'll run it itself */
+	if (ctx->flags & IORING_SETUP_SQPOLL)
+		return;
+
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 46795ee462df..38af82788786 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -57,6 +57,8 @@ void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
 void io_req_task_queue_fail(struct io_kiocb *req, int ret);
 void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
+struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries);
+struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
 void tctx_task_work(struct callback_head *cb);
 __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 int io_uring_alloc_task_context(struct task_struct *task,
@@ -275,6 +277,8 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 
 static inline int io_run_task_work(void)
 {
+	bool ret = false;
+
 	/*
 	 * Always check-and-clear the task_work notification signal. With how
 	 * signaling works for task_work, we can find it set with nothing to
@@ -286,18 +290,26 @@ static inline int io_run_task_work(void)
 	 * PF_IO_WORKER never returns to userspace, so check here if we have
 	 * notify work that needs processing.
 	 */
-	if (current->flags & PF_IO_WORKER &&
-	    test_thread_flag(TIF_NOTIFY_RESUME)) {
-		__set_current_state(TASK_RUNNING);
-		resume_user_mode_work(NULL);
+	if (current->flags & PF_IO_WORKER) {
+		if (test_thread_flag(TIF_NOTIFY_RESUME)) {
+			__set_current_state(TASK_RUNNING);
+			resume_user_mode_work(NULL);
+		}
+		if (current->io_uring) {
+			unsigned int count = 0;
+
+			tctx_task_work_run(current->io_uring, UINT_MAX, &count);
+			if (count)
+				ret = true;
+		}
 	}
 	if (task_work_pending(current)) {
 		__set_current_state(TASK_RUNNING);
 		task_work_run();
-		return 1;
+		ret = true;
 	}
 
-	return 0;
+	return ret;
 }
 
 static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 65b5dbe3c850..28bf0e085d31 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -18,6 +18,7 @@
 #include "sqpoll.h"
 
 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
+#define IORING_TW_CAP_ENTRIES_VALUE	8
 
 enum {
 	IO_SQ_THREAD_SHOULD_STOP = 0,
@@ -219,8 +220,31 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd)
 	return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
 }
 
+/*
+ * Run task_work, processing the retry_list first. The retry_list holds
+ * entries that we passed on in the previous run, if we had more task_work
+ * than we were asked to process. Newly queued task_work isn't run until the
+ * retry list has been fully processed.
+ */
+static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	unsigned int count = 0;
+
+	if (*retry_list) {
+		*retry_list = io_handle_tw_list(*retry_list, &count, max_entries);
+		if (count >= max_entries)
+			return count;
+		max_entries -= count;
+	}
+
+	*retry_list = tctx_task_work_run(tctx, max_entries, &count);
+	return count;
+}
+
 static int io_sq_thread(void *data)
 {
+	struct llist_node *retry_list = NULL;
 	struct io_sq_data *sqd = data;
 	struct io_ring_ctx *ctx;
 	unsigned long timeout = 0;
@@ -257,7 +281,7 @@ static int io_sq_thread(void *data)
 			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
 				sqt_spin = true;
 		}
-		if (io_run_task_work())
+		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
 			sqt_spin = true;
 
 		if (sqt_spin || !time_after(jiffies, timeout)) {
@@ -312,6 +336,9 @@ static int io_sq_thread(void *data)
 		timeout = jiffies + sqd->sq_thread_idle;
 	}
 
+	if (retry_list)
+		io_sq_tw(&retry_list, UINT_MAX);
+
 	io_uring_cancel_generic(true, sqd);
 	sqd->thread = NULL;
 	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-- 
2.43.0


