public inbox for [email protected]
* [PATCH RFC for-next 0/8] io_uring: tw contention improvements
@ 2022-06-20 16:18 Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

Task work currently uses a spin lock to guard task_list and
task_running. Some use cases, such as networking, can trigger
task_work_add from multiple threads all at once, and those threads then
contend on that lock.

This can be changed to use a lockless list, which seems to perform
better. Running the micro benchmark in [1] I see a 20% improvement in
multithreaded task work add. This required removing the priority tw list
optimisation; however, it isn't clear how important that optimisation
is, and its semantics are fairly easy to break.
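
For reference, the add path after this series boils down to roughly the
following. This is a simplified sketch of what patch 3 implements; the
fallback path and notify method selection are left out:

/*
 * Producer side: only the caller that makes the list non-empty needs to
 * schedule task work; everyone else just links their node in.
 */
void io_req_task_work_add(struct io_kiocb *req)
{
	struct io_uring_task *tctx = req->task->io_uring;

	/* llist_add() returns true if the list was empty before the add */
	if (!llist_add(&req->io_task_work.node, &tctx->task_list))
		return;		/* task_work is already pending */

	/* the first adder kicks off the task_work run */
	task_work_add(req->task, &tctx->task_work, req->ctx->notify_method);
}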

Patches 1-2 remove the priority tw list optimisation
Patches 3-5 add lockless lists for task work
Patch 6 fixes a bug I noticed in io_uring event tracing
Patches 7-8 add tracing for task_work_run

Dylan Yudaken (8):
  io_uring: remove priority tw list optimisation
  io_uring: remove __io_req_task_work_add
  io_uring: lockless task list
  io_uring: introduce llist helpers
  io_uring: batch task_work
  io_uring: move io_uring_get_opcode out of TP_printk
  io_uring: add trace event for running task work
  io_uring: trace task_work_run

 include/linux/io_uring_types.h  |   2 +-
 include/trace/events/io_uring.h |  72 +++++++++++++--
 io_uring/io_uring.c             | 150 ++++++++++++--------------------
 io_uring/io_uring.h             |   1 -
 io_uring/rw.c                   |   2 +-
 io_uring/tctx.c                 |   4 +-
 io_uring/tctx.h                 |   7 +-
 7 files changed, 127 insertions(+), 111 deletions(-)


base-commit: 094abe8fbccb0d79bef982c67eb7372e92452c0e
-- 
2.30.2




* [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvements Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

This optimisation has some built-in assumptions that make it easy to
introduce bugs. It also does not have clear enough wins to make it worth
keeping.

Signed-off-by: Dylan Yudaken <[email protected]>
---
 io_uring/io_uring.c | 77 +++++++--------------------------------------
 io_uring/io_uring.h |  1 -
 io_uring/rw.c       |  2 +-
 io_uring/tctx.c     |  1 -
 io_uring/tctx.h     |  1 -
 5 files changed, 12 insertions(+), 70 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index afda42246d12..cc524d33748d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,44 +986,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 	percpu_ref_put(&ctx->refs);
 }
 
-static void handle_prev_tw_list(struct io_wq_work_node *node,
-				struct io_ring_ctx **ctx, bool *uring_locked)
-{
-	if (*ctx && !*uring_locked)
-		spin_lock(&(*ctx)->completion_lock);
-
-	do {
-		struct io_wq_work_node *next = node->next;
-		struct io_kiocb *req = container_of(node, struct io_kiocb,
-						    io_task_work.node);
-
-		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
-		if (req->ctx != *ctx) {
-			if (unlikely(!*uring_locked && *ctx))
-				io_cq_unlock_post(*ctx);
-
-			ctx_flush_and_put(*ctx, uring_locked);
-			*ctx = req->ctx;
-			/* if not contended, grab and improve batching */
-			*uring_locked = mutex_trylock(&(*ctx)->uring_lock);
-			percpu_ref_get(&(*ctx)->refs);
-			if (unlikely(!*uring_locked))
-				io_cq_lock(*ctx);
-		}
-		if (likely(*uring_locked)) {
-			req->io_task_work.func(req, uring_locked);
-		} else {
-			req->cqe.flags = io_put_kbuf_comp(req);
-			__io_req_complete_post(req);
-		}
-		node = next;
-	} while (node);
-
-	if (unlikely(!*uring_locked))
-		io_cq_unlock_post(*ctx);
-}
-
 static void handle_tw_list(struct io_wq_work_node *node,
 			   struct io_ring_ctx **ctx, bool *locked)
 {
@@ -1054,27 +1016,20 @@ void tctx_task_work(struct callback_head *cb)
 						  task_work);
 
 	while (1) {
-		struct io_wq_work_node *node1, *node2;
+		struct io_wq_work_node *node;
 
 		spin_lock_irq(&tctx->task_lock);
-		node1 = tctx->prio_task_list.first;
-		node2 = tctx->task_list.first;
+		node = tctx->task_list.first;
 		INIT_WQ_LIST(&tctx->task_list);
-		INIT_WQ_LIST(&tctx->prio_task_list);
-		if (!node2 && !node1)
+		if (!node)
 			tctx->task_running = false;
 		spin_unlock_irq(&tctx->task_lock);
-		if (!node2 && !node1)
+		if (!node)
 			break;
-
-		if (node1)
-			handle_prev_tw_list(node1, &ctx, &uring_locked);
-		if (node2)
-			handle_tw_list(node2, &ctx, &uring_locked);
+		handle_tw_list(node, &ctx, &uring_locked);
 		cond_resched();
 
-		if (data_race(!tctx->task_list.first) &&
-		    data_race(!tctx->prio_task_list.first) && uring_locked)
+		if (data_race(!tctx->task_list.first) && uring_locked)
 			io_submit_flush_completions(ctx);
 	}
 
@@ -1086,8 +1041,7 @@ void tctx_task_work(struct callback_head *cb)
 }
 
 static void __io_req_task_work_add(struct io_kiocb *req,
-				   struct io_uring_task *tctx,
-				   struct io_wq_work_list *list)
+				   struct io_uring_task *tctx)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_wq_work_node *node;
@@ -1095,7 +1049,7 @@ static void __io_req_task_work_add(struct io_kiocb *req,
 	bool running;
 
 	spin_lock_irqsave(&tctx->task_lock, flags);
-	wq_list_add_tail(&req->io_task_work.node, list);
+	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
 	running = tctx->task_running;
 	if (!running)
 		tctx->task_running = true;
@@ -1113,7 +1067,8 @@ static void __io_req_task_work_add(struct io_kiocb *req,
 
 	spin_lock_irqsave(&tctx->task_lock, flags);
 	tctx->task_running = false;
-	node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
+	node = tctx->task_list.first;
+	INIT_WQ_LIST(&tctx->task_list);
 	spin_unlock_irqrestore(&tctx->task_lock, flags);
 
 	while (node) {
@@ -1129,17 +1084,7 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 
-	__io_req_task_work_add(req, tctx, &tctx->task_list);
-}
-
-void io_req_task_prio_work_add(struct io_kiocb *req)
-{
-	struct io_uring_task *tctx = req->task->io_uring;
-
-	if (req->ctx->flags & IORING_SETUP_SQPOLL)
-		__io_req_task_work_add(req, tctx, &tctx->prio_task_list);
-	else
-		__io_req_task_work_add(req, tctx, &tctx->task_list);
+	__io_req_task_work_add(req, tctx);
 }
 
 static void io_req_tw_post(struct io_kiocb *req, bool *locked)
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 7a00bbe85d35..864e552c76a5 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -35,7 +35,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 bool io_is_uring_fops(struct file *file);
 bool io_alloc_async_data(struct io_kiocb *req);
 void io_req_task_work_add(struct io_kiocb *req);
-void io_req_task_prio_work_add(struct io_kiocb *req);
 void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
 void io_req_task_queue(struct io_kiocb *req);
 void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 9166d8166b82..caf72f50c341 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -215,7 +215,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
 		return;
 	io_req_set_res(req, res, 0);
 	req->io_task_work.func = io_req_task_complete;
-	io_req_task_prio_work_add(req);
+	io_req_task_work_add(req);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 9b30fb0d3603..7a68ba9beec3 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -88,7 +88,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	task->io_uring = tctx;
 	spin_lock_init(&tctx->task_lock);
 	INIT_WQ_LIST(&tctx->task_list);
-	INIT_WQ_LIST(&tctx->prio_task_list);
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index dead0ed00429..c8566ea5dca4 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -22,7 +22,6 @@ struct io_uring_task {
 		spinlock_t		task_lock;
 		bool			task_running;
 		struct io_wq_work_list	task_list;
-		struct io_wq_work_list	prio_task_list;
 		struct callback_head	task_work;
 	} ____cacheline_aligned_in_smp;
 };
-- 
2.30.2




* [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvements Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 3/8] io_uring: lockless task list Dylan Yudaken
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

This is no longer needed as there is now only one caller.

Signed-off-by: Dylan Yudaken <[email protected]>
---
 io_uring/io_uring.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cc524d33748d..e1523b62103b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1040,9 +1040,9 @@ void tctx_task_work(struct callback_head *cb)
 		io_uring_drop_tctx_refs(current);
 }
 
-static void __io_req_task_work_add(struct io_kiocb *req,
-				   struct io_uring_task *tctx)
+void io_req_task_work_add(struct io_kiocb *req)
 {
+	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_wq_work_node *node;
 	unsigned long flags;
@@ -1080,13 +1080,6 @@ static void __io_req_task_work_add(struct io_kiocb *req,
 	}
 }
 
-void io_req_task_work_add(struct io_kiocb *req)
-{
-	struct io_uring_task *tctx = req->task->io_uring;
-
-	__io_req_task_work_add(req, tctx);
-}
-
 static void io_req_tw_post(struct io_kiocb *req, bool *locked)
 {
 	io_req_complete_post(req);
-- 
2.30.2




* [PATCH RFC for-next 3/8] io_uring: lockless task list
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvements Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

In networking use cases we see contention on the spinlock used to
protect the task_list when multiple threads try to add completions at
once. Instead we can use a lockless list, and make the first caller to
add to the list responsible for kicking off task work.

Signed-off-by: Dylan Yudaken <[email protected]>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 38 ++++++++--------------------------
 io_uring/tctx.c                |  3 +--
 io_uring/tctx.h                |  6 +++---
 4 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5987f8acca38..918165a20053 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
 
 struct io_task_work {
 	union {
-		struct io_wq_work_node	node;
+		struct llist_node	node;
 		struct llist_node	fallback_node;
 	};
 	io_req_tw_func_t		func;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e1523b62103b..985b46dfebb6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 	percpu_ref_put(&ctx->refs);
 }
 
-static void handle_tw_list(struct io_wq_work_node *node,
+
+static void handle_tw_list(struct llist_node *node,
 			   struct io_ring_ctx **ctx, bool *locked)
 {
 	do {
-		struct io_wq_work_node *next = node->next;
+		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 
@@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb)
 	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
+	struct llist_node *node = llist_del_all(&tctx->task_list);
 
-	while (1) {
-		struct io_wq_work_node *node;
-
-		spin_lock_irq(&tctx->task_lock);
-		node = tctx->task_list.first;
-		INIT_WQ_LIST(&tctx->task_list);
-		if (!node)
-			tctx->task_running = false;
-		spin_unlock_irq(&tctx->task_lock);
-		if (!node)
-			break;
+	if (node) {
 		handle_tw_list(node, &ctx, &uring_locked);
 		cond_resched();
-
-		if (data_race(!tctx->task_list.first) && uring_locked)
-			io_submit_flush_completions(ctx);
 	}
 
 	ctx_flush_and_put(ctx, &uring_locked);
@@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_wq_work_node *node;
-	unsigned long flags;
+	struct llist_node *node;
 	bool running;
 
-	spin_lock_irqsave(&tctx->task_lock, flags);
-	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
-	running = tctx->task_running;
-	if (!running)
-		tctx->task_running = true;
-	spin_unlock_irqrestore(&tctx->task_lock, flags);
+	running = !llist_add(&req->io_task_work.node, &tctx->task_list);
 
 	/* task_work already pending, we're done */
 	if (running)
@@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req)
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;
 
-	spin_lock_irqsave(&tctx->task_lock, flags);
-	tctx->task_running = false;
-	node = tctx->task_list.first;
-	INIT_WQ_LIST(&tctx->task_list);
-	spin_unlock_irqrestore(&tctx->task_lock, flags);
+
+	node = llist_del_all(&tctx->task_list);
 
 	while (node) {
 		req = container_of(node, struct io_kiocb, io_task_work.node);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 7a68ba9beec3..7f97d97fef0a 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
-	spin_lock_init(&tctx->task_lock);
-	INIT_WQ_LIST(&tctx->task_list);
+	init_llist_head(&tctx->task_list);
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index c8566ea5dca4..8a33ff6e5d91 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/llist.h>
+
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -19,9 +21,7 @@ struct io_uring_task {
 	struct percpu_counter		inflight;
 
 	struct { /* task_work */
-		spinlock_t		task_lock;
-		bool			task_running;
-		struct io_wq_work_list	task_list;
+		struct llist_head	task_list;
 		struct callback_head	task_work;
 	} ____cacheline_aligned_in_smp;
 };
-- 
2.30.2




* [PATCH RFC for-next 4/8] io_uring: introduce llist helpers
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvements Dylan Yudaken
                   ` (2 preceding siblings ...)
  2022-06-20 16:18 ` [PATCH RFC for-next 3/8] io_uring: lockless task list Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 5/8] io_uring: batch task_work Dylan Yudaken
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

Introduce helpers to atomically swap out all entries in an llist.

These will later be moved into common code.

Signed-off-by: Dylan Yudaken <[email protected]>
---
 io_uring/io_uring.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 985b46dfebb6..eb29e3f7da5c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1009,6 +1009,36 @@ static void handle_tw_list(struct llist_node *node,
 	} while (node);
 }
 
+/**
+ * io_llist_xchg - swap all entries in a lock-less list
+ * @head:	the head of the lock-less list to delete all entries from
+ * @node:	new entry to install as the head of the list
+ *
+ * If the list is empty, return NULL, otherwise return a pointer to the first entry.
+ * The order of entries returned is from the newest to the oldest added one.
+ */
+static inline struct llist_node *io_llist_xchg(struct llist_head *head,
+					       struct llist_node *node)
+{
+	return xchg(&head->first, node);
+}
+
+/**
+ * io_llist_cmpxchg - possibly swap all entries in a lock-less list
+ * @head:	the head of the lock-less list
+ * @old:	expected old value of the first entry of the list
+ * @new:	new entry to install as the head of the list
+ *
+ * Perform a cmpxchg on the first entry of the list.
+ */
+
+static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
+						  struct llist_node *old,
+						  struct llist_node *new)
+{
+	return cmpxchg(&head->first, old, new);
+}
+
 void tctx_task_work(struct callback_head *cb)
 {
 	bool uring_locked = false;
-- 
2.30.2




* [PATCH RFC for-next 5/8] io_uring: batch task_work
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvements Dylan Yudaken
                   ` (3 preceding siblings ...)
  2022-06-20 16:18 ` [PATCH RFC for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

Batching up task work is an important performance optimisation, as
task_work_add is expensive.

In order to keep the semantics, replace the task_list with a fake node
while processing the old list, and then do a cmpxchg at the end to see
if there is more work.
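
Spelled out with comments, the resulting loop in tctx_task_work() looks
roughly like this (the same logic as the diff below, simplified):

	struct llist_node fake = {};
	struct llist_node *node;

	/*
	 * Leave a fake node on the list instead of emptying it, so that
	 * concurrent io_req_task_work_add() calls still see a non-empty
	 * list and do not schedule another task_work run.
	 */
	node = io_llist_xchg(&tctx->task_list, &fake);
	handle_tw_list(node, &ctx, &uring_locked, &fake);

	/*
	 * If nothing was added while processing, swap the fake node out
	 * for NULL and stop. Otherwise new entries were queued in front
	 * of the fake node: grab them and go around again.
	 */
	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
	while (node != &fake) {
		node = io_llist_xchg(&tctx->task_list, &fake);
		handle_tw_list(node, &ctx, &uring_locked, &fake);
		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
	}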

Signed-off-by: Dylan Yudaken <[email protected]>
---
 io_uring/io_uring.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index eb29e3f7da5c..0b4d3eb06d16 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -988,9 +988,10 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 
 
 static void handle_tw_list(struct llist_node *node,
-			   struct io_ring_ctx **ctx, bool *locked)
+			   struct io_ring_ctx **ctx, bool *locked,
+			   struct llist_node *fake)
 {
-	do {
+	while (node && node != fake) {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
@@ -1006,7 +1007,10 @@ static void handle_tw_list(struct llist_node *node,
 		}
 		req->io_task_work.func(req, locked);
 		node = next;
-	} while (node);
+		count++;
+	}
+
+	return count;
 }
 
 /**
@@ -1045,11 +1049,15 @@ void tctx_task_work(struct callback_head *cb)
 	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
-	struct llist_node *node = llist_del_all(&tctx->task_list);
-
-	if (node) {
-		handle_tw_list(node, &ctx, &uring_locked);
-		cond_resched();
+	struct llist_node fake = {};
+	struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+
+	handle_tw_list(node, &ctx, &uring_locked, &fake);
+	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
+	while (node != &fake) {
+		node = io_llist_xchg(&tctx->task_list, &fake);
+		handle_tw_list(node, &ctx, &uring_locked, &fake);
+		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
 	}
 
 	ctx_flush_and_put(ctx, &uring_locked);
-- 
2.30.2




* [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvements Dylan Yudaken
                   ` (4 preceding siblings ...)
  2022-06-20 16:18 ` [PATCH RFC for-next 5/8] io_uring: batch task_work Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:19 ` [PATCH RFC for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
  2022-06-20 16:19 ` [PATCH RFC for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
  7 siblings, 0 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

TP_printk macros are not supposed to use custom code ([1]), or else
tools such as perf cannot use these events.

Convert the opcode string representation to use the __string wiring that
the event framework provides ([2]).

[1]: https://lwn.net/Articles/379903/
[2]: https://lwn.net/Articles/381064/

Fixes: 033b87d2 ("io_uring: use the text representation of ops in trace")
Signed-off-by: Dylan Yudaken <[email protected]>
---
 include/trace/events/io_uring.h | 42 +++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 5635912e1013..3bc8dec9acaa 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -151,6 +151,8 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field(  unsigned int,			flags		)
 		__field(  struct io_wq_work *,		work		)
 		__field(  int,				rw		)
+
+		__string( op_str, io_uring_get_opcode(req->opcode)	)
 	),
 
 	TP_fast_assign(
@@ -161,11 +163,13 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__entry->opcode		= req->opcode;
 		__entry->work		= &req->work;
 		__entry->rw		= rw;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
 		__entry->ctx, __entry->req, __entry->user_data,
-		io_uring_get_opcode(__entry->opcode),
+		__get_str(op_str),
 		__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
 );
 
@@ -188,6 +192,8 @@ TRACE_EVENT(io_uring_defer,
 		__field(  void *,		req	)
 		__field(  unsigned long long,	data	)
 		__field(  u8,			opcode	)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -195,11 +201,13 @@ TRACE_EVENT(io_uring_defer,
 		__entry->req	= req;
 		__entry->data	= req->cqe.user_data;
 		__entry->opcode	= req->opcode;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
 		__entry->ctx, __entry->req, __entry->data,
-		io_uring_get_opcode(__entry->opcode))
+		__get_str(op_str))
 );
 
 /**
@@ -284,6 +292,8 @@ TRACE_EVENT(io_uring_fail_link,
 		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
 		__field(  void *,		link		)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -292,11 +302,13 @@ TRACE_EVENT(io_uring_fail_link,
 		__entry->user_data	= req->cqe.user_data;
 		__entry->opcode		= req->opcode;
 		__entry->link		= link;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
 		__entry->ctx, __entry->req, __entry->user_data,
-		io_uring_get_opcode(__entry->opcode), __entry->link)
+		__get_str(op_str), __entry->link)
 );
 
 /**
@@ -370,6 +382,8 @@ TRACE_EVENT(io_uring_submit_sqe,
 		__field(  u32,			flags		)
 		__field(  bool,			force_nonblock	)
 		__field(  bool,			sq_thread	)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -380,11 +394,13 @@ TRACE_EVENT(io_uring_submit_sqe,
 		__entry->flags		= req->flags;
 		__entry->force_nonblock	= force_nonblock;
 		__entry->sq_thread	= req->ctx->flags & IORING_SETUP_SQPOLL;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
 		  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
-		  __entry->user_data, io_uring_get_opcode(__entry->opcode),
+		  __entry->user_data, __get_str(op_str),
 		  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
 );
 
@@ -411,6 +427,8 @@ TRACE_EVENT(io_uring_poll_arm,
 		__field(  u8,			opcode		)
 		__field(  int,			mask		)
 		__field(  int,			events		)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -420,11 +438,13 @@ TRACE_EVENT(io_uring_poll_arm,
 		__entry->opcode		= req->opcode;
 		__entry->mask		= mask;
 		__entry->events		= events;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
 		  __entry->ctx, __entry->req, __entry->user_data,
-		  io_uring_get_opcode(__entry->opcode),
+		  __get_str(op_str),
 		  __entry->mask, __entry->events)
 );
 
@@ -447,6 +467,8 @@ TRACE_EVENT(io_uring_task_add,
 		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
 		__field(  int,			mask		)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -455,11 +477,13 @@ TRACE_EVENT(io_uring_task_add,
 		__entry->user_data	= req->cqe.user_data;
 		__entry->opcode		= req->opcode;
 		__entry->mask		= mask;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
 		__entry->ctx, __entry->req, __entry->user_data,
-		io_uring_get_opcode(__entry->opcode),
+		__get_str(op_str),
 		__entry->mask)
 );
 
@@ -495,6 +519,8 @@ TRACE_EVENT(io_uring_req_failed,
 		__field( u64,			pad1		)
 		__field( u64,			addr3		)
 		__field( int,			error		)
+
+		__string( op_str, io_uring_get_opcode(sqe->opcode) )
 	),
 
 	TP_fast_assign(
@@ -514,6 +540,8 @@ TRACE_EVENT(io_uring_req_failed,
 		__entry->pad1		= sqe->__pad2[0];
 		__entry->addr3		= sqe->addr3;
 		__entry->error		= error;
+
+		__assign_str(op_str, io_uring_get_opcode(sqe->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, "
@@ -522,7 +550,7 @@ TRACE_EVENT(io_uring_req_failed,
 		  "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, "
 		  "error=%d",
 		  __entry->ctx, __entry->req, __entry->user_data,
-		  io_uring_get_opcode(__entry->opcode),
+		  __get_str(op_str),
 		  __entry->flags, __entry->ioprio,
 		  (unsigned long long)__entry->off,
 		  (unsigned long long) __entry->addr, __entry->len,
-- 
2.30.2




* [PATCH RFC for-next 7/8] io_uring: add trace event for running task work
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvements Dylan Yudaken
                   ` (5 preceding siblings ...)
  2022-06-20 16:18 ` [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
@ 2022-06-20 16:19 ` Dylan Yudaken
  2022-06-20 16:19 ` [PATCH RFC for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
  7 siblings, 0 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:19 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

This is useful for investigating whether task_work is being batched.

Signed-off-by: Dylan Yudaken <[email protected]>
---
 include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 3bc8dec9acaa..918e3a43e4b2 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow,
 		  __entry->cflags, __entry->ocqe)
 );
 
+/*
+ * io_uring_task_work_run - ran task work
+ *
+ * @tctx:		pointer to an io_uring_task
+ * @count:		how many functions it ran
+ * @loops:		how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_task_work_run,
+
+	TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+
+	TP_ARGS(tctx, count, loops),
+
+	TP_STRUCT__entry (
+		__field(  void *,		tctx		)
+		__field(  unsigned int,		count		)
+		__field(  unsigned int,		loops		)
+	),
+
+	TP_fast_assign(
+		__entry->tctx		= tctx;
+		__entry->count		= count;
+		__entry->loops		= loops;
+	),
+
+	TP_printk("tctx %p, count %u, loops %u",
+		 __entry->tctx, __entry->count, __entry->loops)
+);
+
 #endif /* _TRACE_IO_URING_H */
 
 /* This part must be outside protection */
-- 
2.30.2




* [PATCH RFC for-next 8/8] io_uring: trace task_work_run
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvements Dylan Yudaken
                   ` (6 preceding siblings ...)
  2022-06-20 16:19 ` [PATCH RFC for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
@ 2022-06-20 16:19 ` Dylan Yudaken
  7 siblings, 0 replies; 9+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:19 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

Trace task_work_run to help provide stats on how often task work is run
and what batch sizes are coming through.

Signed-off-by: Dylan Yudaken <[email protected]>
---
 io_uring/io_uring.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0b4d3eb06d16..8cf868315008 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -987,10 +987,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 }
 
 
-static void handle_tw_list(struct llist_node *node,
-			   struct io_ring_ctx **ctx, bool *locked,
-			   struct llist_node *fake)
+static unsigned int handle_tw_list(struct llist_node *node,
+				   struct io_ring_ctx **ctx, bool *locked,
+				   struct llist_node *fake)
 {
+	unsigned int count = 0;
+
 	while (node && node != fake) {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
@@ -1051,12 +1053,14 @@ void tctx_task_work(struct callback_head *cb)
 						  task_work);
 	struct llist_node fake = {};
 	struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+	unsigned int loops = 1;
+	unsigned int count = handle_tw_list(node, &ctx, &uring_locked, &fake);
 
-	handle_tw_list(node, &ctx, &uring_locked, &fake);
 	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
 	while (node != &fake) {
+		loops++;
 		node = io_llist_xchg(&tctx->task_list, &fake);
-		handle_tw_list(node, &ctx, &uring_locked, &fake);
+		count += handle_tw_list(node, &ctx, &uring_locked, &fake);
 		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
 	}
 
@@ -1065,6 +1069,8 @@ void tctx_task_work(struct callback_head *cb)
 	/* relaxed read is enough as only the task itself sets ->in_idle */
 	if (unlikely(atomic_read(&tctx->in_idle)))
 		io_uring_drop_tctx_refs(current);
+
+	trace_io_uring_task_work_run(tctx, count, loops);
 }
 
 void io_req_task_work_add(struct io_kiocb *req)
-- 
2.30.2



