* [PATCH v2 for-next 1/8] io_uring: remove priority tw list optimisation
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
` (8 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This optimisation has some built-in assumptions that make it easy to
introduce bugs. It also does not have clear wins that make it worth keeping.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 77 +++++++--------------------------------------
io_uring/io_uring.h | 1 -
io_uring/rw.c | 2 +-
io_uring/tctx.c | 1 -
io_uring/tctx.h | 1 -
5 files changed, 12 insertions(+), 70 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index afda42246d12..cc524d33748d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,44 +986,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-static void handle_prev_tw_list(struct io_wq_work_node *node,
- struct io_ring_ctx **ctx, bool *uring_locked)
-{
- if (*ctx && !*uring_locked)
- spin_lock(&(*ctx)->completion_lock);
-
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
- if (req->ctx != *ctx) {
- if (unlikely(!*uring_locked && *ctx))
- io_cq_unlock_post(*ctx);
-
- ctx_flush_and_put(*ctx, uring_locked);
- *ctx = req->ctx;
- /* if not contended, grab and improve batching */
- *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
- percpu_ref_get(&(*ctx)->refs);
- if (unlikely(!*uring_locked))
- io_cq_lock(*ctx);
- }
- if (likely(*uring_locked)) {
- req->io_task_work.func(req, uring_locked);
- } else {
- req->cqe.flags = io_put_kbuf_comp(req);
- __io_req_complete_post(req);
- }
- node = next;
- } while (node);
-
- if (unlikely(!*uring_locked))
- io_cq_unlock_post(*ctx);
-}
-
static void handle_tw_list(struct io_wq_work_node *node,
struct io_ring_ctx **ctx, bool *locked)
{
@@ -1054,27 +1016,20 @@ void tctx_task_work(struct callback_head *cb)
task_work);
while (1) {
- struct io_wq_work_node *node1, *node2;
+ struct io_wq_work_node *node;
spin_lock_irq(&tctx->task_lock);
- node1 = tctx->prio_task_list.first;
- node2 = tctx->task_list.first;
+ node = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prio_task_list);
- if (!node2 && !node1)
+ if (!node)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
- if (!node2 && !node1)
+ if (!node)
break;
-
- if (node1)
- handle_prev_tw_list(node1, &ctx, &uring_locked);
- if (node2)
- handle_tw_list(node2, &ctx, &uring_locked);
+ handle_tw_list(node, &ctx, &uring_locked);
cond_resched();
- if (data_race(!tctx->task_list.first) &&
- data_race(!tctx->prio_task_list.first) && uring_locked)
+ if (data_race(!tctx->task_list.first) && uring_locked)
io_submit_flush_completions(ctx);
}
@@ -1086,8 +1041,7 @@ void tctx_task_work(struct callback_head *cb)
}
static void __io_req_task_work_add(struct io_kiocb *req,
- struct io_uring_task *tctx,
- struct io_wq_work_list *list)
+ struct io_uring_task *tctx)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_wq_work_node *node;
@@ -1095,7 +1049,7 @@ static void __io_req_task_work_add(struct io_kiocb *req,
bool running;
spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, list);
+ wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -1113,7 +1067,8 @@ static void __io_req_task_work_add(struct io_kiocb *req,
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
+ node = tctx->task_list.first;
+ INIT_WQ_LIST(&tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -1129,17 +1084,7 @@ void io_req_task_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
- __io_req_task_work_add(req, tctx, &tctx->task_list);
-}
-
-void io_req_task_prio_work_add(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = req->task->io_uring;
-
- if (req->ctx->flags & IORING_SETUP_SQPOLL)
- __io_req_task_work_add(req, tctx, &tctx->prio_task_list);
- else
- __io_req_task_work_add(req, tctx, &tctx->task_list);
+ __io_req_task_work_add(req, tctx);
}
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index f026d2670959..f77e4a5403e4 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -36,7 +36,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_work_add(struct io_kiocb *req);
-void io_req_task_prio_work_add(struct io_kiocb *req);
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index a308fc956114..e6cf1c3d8a29 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -215,7 +215,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
return;
io_req_set_res(req, res, 0);
req->io_task_work.func = io_req_task_complete;
- io_req_task_prio_work_add(req);
+ io_req_task_work_add(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 9b30fb0d3603..7a68ba9beec3 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -88,7 +88,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prio_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index dead0ed00429..c8566ea5dca4 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -22,7 +22,6 @@ struct io_uring_task {
spinlock_t task_lock;
bool task_running;
struct io_wq_work_list task_list;
- struct io_wq_work_list prio_task_list;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
--
2.30.2
* [PATCH v2 for-next 2/8] io_uring: remove __io_req_task_work_add
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 3/8] io_uring: lockless task list Dylan Yudaken
` (7 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This is no longer needed as there is only one caller.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cc524d33748d..e1523b62103b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1040,9 +1040,9 @@ void tctx_task_work(struct callback_head *cb)
io_uring_drop_tctx_refs(current);
}
-static void __io_req_task_work_add(struct io_kiocb *req,
- struct io_uring_task *tctx)
+void io_req_task_work_add(struct io_kiocb *req)
{
+ struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
struct io_wq_work_node *node;
unsigned long flags;
@@ -1080,13 +1080,6 @@ static void __io_req_task_work_add(struct io_kiocb *req,
}
}
-void io_req_task_work_add(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = req->task->io_uring;
-
- __io_req_task_work_add(req, tctx);
-}
-
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
{
io_req_complete_post(req);
--
2.30.2
* [PATCH v2 for-next 3/8] io_uring: lockless task list
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
` (6 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
With networking use cases we see contention on the spinlock used to
protect the task_list when multiple threads try to add completions at once.
Instead we can use a lockless list, and assume that the first caller to
add to the list is responsible for kicking off task work.
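For reference, a minimal sketch of the add-side pattern this relies on
(illustrative only, not the exact io_uring code; the real logic is in
io_req_task_work_add() in the diff below). llist_add() returns true only
when the list was empty before the insertion, so exactly one producer
observes "first" and queues the task_work callback:

/*
 * Sketch, assuming <linux/llist.h> and <linux/task_work.h>; TWA_SIGNAL
 * stands in for the ring's actual notify method.
 */
static void sketch_tw_add(struct io_kiocb *req, struct io_uring_task *tctx)
{
	bool first = llist_add(&req->io_task_work.node, &tctx->task_list);

	/*
	 * Only the caller that found the list empty schedules task work;
	 * everyone else piggybacks on the already-pending run.
	 */
	if (first)
		task_work_add(req->task, &tctx->task_work, TWA_SIGNAL);
}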
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/linux/io_uring_types.h | 2 +-
io_uring/io_uring.c | 38 ++++++++--------------------------
io_uring/tctx.c | 3 +--
io_uring/tctx.h | 6 +++---
4 files changed, 14 insertions(+), 35 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5987f8acca38..918165a20053 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
struct io_task_work {
union {
- struct io_wq_work_node node;
+ struct llist_node node;
struct llist_node fallback_node;
};
io_req_tw_func_t func;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e1523b62103b..985b46dfebb6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-static void handle_tw_list(struct io_wq_work_node *node,
+
+static void handle_tw_list(struct llist_node *node,
struct io_ring_ctx **ctx, bool *locked)
{
do {
- struct io_wq_work_node *next = node->next;
+ struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
+ struct llist_node *node = llist_del_all(&tctx->task_list);
- while (1) {
- struct io_wq_work_node *node;
-
- spin_lock_irq(&tctx->task_lock);
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
- if (!node)
- tctx->task_running = false;
- spin_unlock_irq(&tctx->task_lock);
- if (!node)
- break;
+ if (node) {
handle_tw_list(node, &ctx, &uring_locked);
cond_resched();
-
- if (data_race(!tctx->task_list.first) && uring_locked)
- io_submit_flush_completions(ctx);
}
ctx_flush_and_put(ctx, &uring_locked);
@@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
- struct io_wq_work_node *node;
- unsigned long flags;
+ struct llist_node *node;
bool running;
- spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
- running = tctx->task_running;
- if (!running)
- tctx->task_running = true;
- spin_unlock_irqrestore(&tctx->task_lock, flags);
+ running = !llist_add(&req->io_task_work.node, &tctx->task_list);
/* task_work already pending, we're done */
if (running)
@@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req)
if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
return;
- spin_lock_irqsave(&tctx->task_lock, flags);
- tctx->task_running = false;
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
- spin_unlock_irqrestore(&tctx->task_lock, flags);
+
+ node = llist_del_all(&tctx->task_list);
while (node) {
req = container_of(node, struct io_kiocb, io_task_work.node);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 7a68ba9beec3..7f97d97fef0a 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
- spin_lock_init(&tctx->task_lock);
- INIT_WQ_LIST(&tctx->task_list);
+ init_llist_head(&tctx->task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index c8566ea5dca4..8a33ff6e5d91 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/llist.h>
+
/*
* Arbitrary limit, can be raised if need be
*/
@@ -19,9 +21,7 @@ struct io_uring_task {
struct percpu_counter inflight;
struct { /* task_work */
- spinlock_t task_lock;
- bool task_running;
- struct io_wq_work_list task_list;
+ struct llist_head task_list;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
--
2.30.2
* [PATCH v2 for-next 4/8] io_uring: introduce llist helpers
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (2 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 3/8] io_uring: lockless task list Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 5/8] io_uring: batch task_work Dylan Yudaken
` (5 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Introduce helpers to atomically switch an llist.
These will later be moved into common code.
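As a rough usage sketch (illustrative, not part of the patch): passing NULL
as the new head makes io_llist_xchg() behave like llist_del_all(), detaching
every queued node in one atomic step, while io_llist_cmpxchg() only swaps
the head if it still matches the expected old value:

	/* Drain everything currently queued, assuming the helpers below. */
	struct llist_node *node = io_llist_xchg(&tctx->task_list, NULL);

	while (node) {
		struct llist_node *next = node->next;

		/* ... process the request embedding this node ... */
		node = next;
	}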
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 985b46dfebb6..eb29e3f7da5c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1009,6 +1009,36 @@ static void handle_tw_list(struct llist_node *node,
} while (node);
}
+/**
+ * io_llist_xchg - swap all entries in a lock-less list
+ * @head: the head of lock-less list to delete all entries
+ * @node: new entry as the head of the list
+ *
+ * If list is empty, return NULL, otherwise, return the pointer to the first entry.
+ * The order of entries returned is from the newest to the oldest added one.
+ */
+static inline struct llist_node *io_llist_xchg(struct llist_head *head,
+ struct llist_node *node)
+{
+ return xchg(&head->first, node);
+}
+
+/**
+ * io_llist_cmpxchg - possibly swap all entries in a lock-less list
+ * @head: the head of lock-less list to delete all entries
+ * @old: expected old value of the first entry of the list
+ * @new: new entry as the head of the list
+ *
+ * perform a cmpxchg on the first entry of the list.
+ */
+
+static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
+ struct llist_node *old,
+ struct llist_node *new)
+{
+ return cmpxchg(&head->first, old, new);
+}
+
void tctx_task_work(struct callback_head *cb)
{
bool uring_locked = false;
--
2.30.2
* [PATCH v2 for-next 5/8] io_uring: batch task_work
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (3 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
` (4 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Batching task work up is an important performance optimisation, as
task_work_add is expensive.
In order to keep the semantics, replace the task_list with a fake node
while processing the old list, and then do a cmpxchg at the end to see if
there is more work.
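A commented sketch of the resulting consume loop (the comments are
explanatory additions; the calls mirror the hunk below):

	struct llist_node fake = {};
	/*
	 * Publish the fake node as the list head: producers now always see a
	 * non-empty list, so they keep adding without re-arming task_work.
	 */
	struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);

	/* Run everything that was queued before the swap. */
	handle_tw_list(node, &ctx, &uring_locked, NULL);

	/* If nothing new arrived, swap &fake out for NULL and stop. */
	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
	while (node != &fake) {
		/*
		 * New work was queued in front of &fake: detach the whole
		 * chain and run it, using &fake as the terminating node.
		 */
		node = io_llist_xchg(&tctx->task_list, &fake);
		handle_tw_list(node, &ctx, &uring_locked, &fake);
		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
	}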
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 22 +++++++++++++---------
1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index eb29e3f7da5c..19bd7d5ec90c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,11 +986,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-
static void handle_tw_list(struct llist_node *node,
- struct io_ring_ctx **ctx, bool *locked)
+ struct io_ring_ctx **ctx, bool *locked,
+ struct llist_node *last)
{
- do {
+ while (node != last) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1006,7 +1006,7 @@ static void handle_tw_list(struct llist_node *node,
}
req->io_task_work.func(req, locked);
node = next;
- } while (node);
+ }
}
/**
@@ -1045,11 +1045,15 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
- struct llist_node *node = llist_del_all(&tctx->task_list);
-
- if (node) {
- handle_tw_list(node, &ctx, &uring_locked);
- cond_resched();
+ struct llist_node fake = {};
+ struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+
+ handle_tw_list(node, &ctx, &uring_locked, NULL);
+ node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
+ while (node != &fake) {
+ node = io_llist_xchg(&tctx->task_list, &fake);
+ handle_tw_list(node, &ctx, &uring_locked, &fake);
+ node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
}
ctx_flush_and_put(ctx, &uring_locked);
--
2.30.2
* [PATCH v2 for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (4 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 5/8] io_uring: batch task_work Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
` (3 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
The TP_printk macros are not supposed to use custom code ([1]) or else
tools such as perf cannot use these events.
Convert the opcode string representation to use the __string wiring that
the event framework provides ([2]).
[1]: https://lwn.net/Articles/379903/
[2]: https://lwn.net/Articles/381064/
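For anyone unfamiliar with that wiring, a minimal hypothetical event using
it would look roughly like this (the event name is made up; the real
conversions are in the diff below):

	TRACE_EVENT(example_opcode_event,
		TP_PROTO(struct io_kiocb *req),
		TP_ARGS(req),

		TP_STRUCT__entry(
			/* reserve space for the string in the trace entry */
			__string( op_str, io_uring_get_opcode(req->opcode) )
		),

		TP_fast_assign(
			/* copy the string at trace time, while req is valid */
			__assign_str(op_str, io_uring_get_opcode(req->opcode));
		),

		/* TP_printk now only formats recorded fields, so perf can parse it */
		TP_printk("opcode %s", __get_str(op_str))
	);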
Fixes: 033b87d2 ("io_uring: use the text representation of ops in trace")
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/trace/events/io_uring.h | 42 +++++++++++++++++++++++++++------
1 file changed, 35 insertions(+), 7 deletions(-)
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 5635912e1013..3bc8dec9acaa 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -151,6 +151,8 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( unsigned int, flags )
__field( struct io_wq_work *, work )
__field( int, rw )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -161,11 +163,13 @@ TRACE_EVENT(io_uring_queue_async_work,
__entry->opcode = req->opcode;
__entry->work = &req->work;
__entry->rw = rw;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
);
@@ -188,6 +192,8 @@ TRACE_EVENT(io_uring_defer,
__field( void *, req )
__field( unsigned long long, data )
__field( u8, opcode )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -195,11 +201,13 @@ TRACE_EVENT(io_uring_defer,
__entry->req = req;
__entry->data = req->cqe.user_data;
__entry->opcode = req->opcode;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
__entry->ctx, __entry->req, __entry->data,
- io_uring_get_opcode(__entry->opcode))
+ __get_str(op_str))
);
/**
@@ -284,6 +292,8 @@ TRACE_EVENT(io_uring_fail_link,
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( void *, link )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -292,11 +302,13 @@ TRACE_EVENT(io_uring_fail_link,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->link = link;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode), __entry->link)
+ __get_str(op_str), __entry->link)
);
/**
@@ -370,6 +382,8 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -380,11 +394,13 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
- __entry->user_data, io_uring_get_opcode(__entry->opcode),
+ __entry->user_data, __get_str(op_str),
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
);
@@ -411,6 +427,8 @@ TRACE_EVENT(io_uring_poll_arm,
__field( u8, opcode )
__field( int, mask )
__field( int, events )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -420,11 +438,13 @@ TRACE_EVENT(io_uring_poll_arm,
__entry->opcode = req->opcode;
__entry->mask = mask;
__entry->events = events;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->mask, __entry->events)
);
@@ -447,6 +467,8 @@ TRACE_EVENT(io_uring_task_add,
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( int, mask )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -455,11 +477,13 @@ TRACE_EVENT(io_uring_task_add,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->mask = mask;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->mask)
);
@@ -495,6 +519,8 @@ TRACE_EVENT(io_uring_req_failed,
__field( u64, pad1 )
__field( u64, addr3 )
__field( int, error )
+
+ __string( op_str, io_uring_get_opcode(sqe->opcode) )
),
TP_fast_assign(
@@ -514,6 +540,8 @@ TRACE_EVENT(io_uring_req_failed,
__entry->pad1 = sqe->__pad2[0];
__entry->addr3 = sqe->addr3;
__entry->error = error;
+
+ __assign_str(op_str, io_uring_get_opcode(sqe->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, "
@@ -522,7 +550,7 @@ TRACE_EVENT(io_uring_req_failed,
"personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, "
"error=%d",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->flags, __entry->ioprio,
(unsigned long long)__entry->off,
(unsigned long long) __entry->addr, __entry->len,
--
2.30.2
* [PATCH v2 for-next 7/8] io_uring: add trace event for running task work
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (5 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
` (2 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This is useful for investigating whether task_work is batching.
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 3bc8dec9acaa..918e3a43e4b2 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow,
__entry->cflags, __entry->ocqe)
);
+/*
+ * io_uring_task_work_run - ran task work
+ *
+ * @tctx: pointer to a io_uring_task
+ * @count: how many functions it ran
+ * @loops: how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_task_work_run,
+
+ TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+
+ TP_ARGS(tctx, count, loops),
+
+ TP_STRUCT__entry (
+ __field( void *, tctx )
+ __field( unsigned int, count )
+ __field( unsigned int, loops )
+ ),
+
+ TP_fast_assign(
+ __entry->tctx = tctx;
+ __entry->count = count;
+ __entry->loops = loops;
+ ),
+
+ TP_printk("tctx %p, count %u, loops %u",
+ __entry->tctx, __entry->count, __entry->loops)
+);
+
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
--
2.30.2
* [PATCH v2 for-next 8/8] io_uring: trace task_work_run
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (6 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 15:21 ` [PATCH v2 for-next 0/8] io_uring: tw contention improvments Jens Axboe
2022-06-22 17:39 ` Jens Axboe
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Trace task_work_run to help provide stats on how often task work is run
and what batch sizes are coming through.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 19bd7d5ec90c..1b359249e933 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,10 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-static void handle_tw_list(struct llist_node *node,
- struct io_ring_ctx **ctx, bool *locked,
- struct llist_node *last)
+static unsigned int handle_tw_list(struct llist_node *node,
+ struct io_ring_ctx **ctx, bool *locked,
+ struct llist_node *last)
{
+ unsigned int count = 0;
+
while (node != last) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
@@ -1006,7 +1008,10 @@ static void handle_tw_list(struct llist_node *node,
}
req->io_task_work.func(req, locked);
node = next;
+ count++;
}
+
+ return count;
}
/**
@@ -1047,12 +1052,14 @@ void tctx_task_work(struct callback_head *cb)
task_work);
struct llist_node fake = {};
struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+ unsigned int loops = 1;
+ unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL);
- handle_tw_list(node, &ctx, &uring_locked, NULL);
node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
while (node != &fake) {
+ loops++;
node = io_llist_xchg(&tctx->task_list, &fake);
- handle_tw_list(node, &ctx, &uring_locked, &fake);
+ count += handle_tw_list(node, &ctx, &uring_locked, &fake);
node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
}
@@ -1061,6 +1068,8 @@ void tctx_task_work(struct callback_head *cb)
/* relaxed read is enough as only the task itself sets ->in_idle */
if (unlikely(atomic_read(&tctx->in_idle)))
io_uring_drop_tctx_refs(current);
+
+ trace_io_uring_task_work_run(tctx, count, loops);
}
void io_req_task_work_add(struct io_kiocb *req)
--
2.30.2
* Re: [PATCH v2 for-next 0/8] io_uring: tw contention improvments
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (7 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
@ 2022-06-22 15:21 ` Jens Axboe
2022-06-23 8:23 ` Hao Xu
2022-06-22 17:39 ` Jens Axboe
9 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2022-06-22 15:21 UTC (permalink / raw)
To: Dylan Yudaken, asml.silence, io-uring; +Cc: Kernel-team
On 6/22/22 7:40 AM, Dylan Yudaken wrote:
> Task work currently uses a spin lock to guard task_list and
> task_running. Some use cases such as networking can trigger task_work_add
> from multiple threads all at once, which suffers from contention here.
>
> This can be changed to use a lockless list which seems to have better
> performance. Running the micro benchmark in [1] I see 20% improvment in
> multithreaded task work add. It required removing the priority tw list
> optimisation, however it isn't clear how important that optimisation is.
> Additionally it has fairly easy to break semantics.
>
> Patch 1-2 remove the priority tw list optimisation
> Patch 3-5 add lockless lists for task work
> Patch 6 fixes a bug I noticed in io_uring event tracing
> Patch 7-8 adds tracing for task_work_run
I ran some IRQ driven workloads on this. Basic 512b random read, DIO,
IRQ, and then at queue depths 1-64, doubling every time. Once we get to
QD=8, start doing submit/complete batch of 1/4th of the QD so we ramp up
there too. Results below, first set is 5.19-rc3 + for-5.20/io_uring,
second set is that plus this series.
This is what I ran:
sudo taskset -c 12 t/io_uring -d<QD> -b512 -s<batch> -c<batch> -p0 -F1 -B1 -n1 -D0 -R0 -X1 -R1 -t1 -r5 /dev/nvme0n1
on a gen2 optane drive.
tldr - looks like an improvement there too, and no ill effects seen on
latency.
5.19-rc3 + for-5.20/io_uring:
QD=1, Batch=1
Maximum IOPS=244K
1509: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3996], 5.0000th=[ 3996], 10.0000th=[ 3996],
| 20.0000th=[ 4036], 30.0000th=[ 4036], 40.0000th=[ 4036],
| 50.0000th=[ 4036], 60.0000th=[ 4036], 70.0000th=[ 4036],
| 80.0000th=[ 4076], 90.0000th=[ 4116], 95.0000th=[ 4196],
| 99.0000th=[ 4437], 99.5000th=[ 5421], 99.9000th=[ 7590],
| 99.9500th=[ 9518], 99.9900th=[32289]
QD=2, Batch=1
Maximum IOPS=483K
1533: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3714], 5.0000th=[ 3755], 10.0000th=[ 3795],
| 20.0000th=[ 3795], 30.0000th=[ 3835], 40.0000th=[ 3955],
| 50.0000th=[ 4036], 60.0000th=[ 4076], 70.0000th=[ 4076],
| 80.0000th=[ 4076], 90.0000th=[ 4116], 95.0000th=[ 4156],
| 99.0000th=[ 4518], 99.5000th=[ 6144], 99.9000th=[ 7510],
| 99.9500th=[ 9839], 99.9900th=[32289]
QD=4, Batch=1
Maximum IOPS=907K
1583: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3393], 5.0000th=[ 3514], 10.0000th=[ 3594],
| 20.0000th=[ 3634], 30.0000th=[ 3795], 40.0000th=[ 3875],
| 50.0000th=[ 3955], 60.0000th=[ 4076], 70.0000th=[ 4156],
| 80.0000th=[ 4277], 90.0000th=[ 4397], 95.0000th=[ 4477],
| 99.0000th=[ 5120], 99.5000th=[ 5903], 99.9000th=[ 9357],
| 99.9500th=[11004], 99.9900th=[32289]
QD=8, Batch=2
Maximum IOPS=1688K
1631: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3353], 5.0000th=[ 3554], 10.0000th=[ 3634],
| 20.0000th=[ 3755], 30.0000th=[ 3875], 40.0000th=[ 4036],
| 50.0000th=[ 4156], 60.0000th=[ 4277], 70.0000th=[ 4437],
| 80.0000th=[ 4678], 90.0000th=[ 4839], 95.0000th=[ 5040],
| 99.0000th=[ 6305], 99.5000th=[ 7028], 99.9000th=[10080],
| 99.9500th=[15502], 99.9900th=[32932]
QD=16, Batch=4
Maximum IOPS=2613K
1680: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3955], 5.0000th=[ 4397], 10.0000th=[ 4558],
| 20.0000th=[ 4759], 30.0000th=[ 4959], 40.0000th=[ 5120],
| 50.0000th=[ 5261], 60.0000th=[ 5502], 70.0000th=[ 5743],
| 80.0000th=[ 5903], 90.0000th=[ 6305], 95.0000th=[ 6706],
| 99.0000th=[ 8393], 99.5000th=[ 8955], 99.9000th=[11325],
| 99.9500th=[31968], 99.9900th=[34217]
QD=32, Batch=8
Maximum IOPS=3573K
1706: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 4919], 5.0000th=[ 5662], 10.0000th=[ 5903],
| 20.0000th=[ 6144], 30.0000th=[ 6465], 40.0000th=[ 6626],
| 50.0000th=[ 6867], 60.0000th=[ 7188], 70.0000th=[ 7510],
| 80.0000th=[ 7992], 90.0000th=[ 8714], 95.0000th=[ 9357],
| 99.0000th=[11325], 99.5000th=[11967], 99.9000th=[16626],
| 99.9500th=[34217], 99.9900th=[37108]
QD=64, Batch=16
Maximum IOPS=3953K
1735: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 6626], 5.0000th=[ 7188], 10.0000th=[ 7510],
| 20.0000th=[ 7992], 30.0000th=[ 8393], 40.0000th=[ 9116],
| 50.0000th=[10160], 60.0000th=[11164], 70.0000th=[11646],
| 80.0000th=[12128], 90.0000th=[12931], 95.0000th=[13735],
| 99.0000th=[15984], 99.5000th=[16787], 99.9000th=[34217],
| 99.9500th=[38072], 99.9900th=[40964]
============
5.19-rc3 + for-5.20/io_uring + this series:
QD=1, Batch=1
Maximum IOPS=246K
909: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3955], 5.0000th=[ 3996], 10.0000th=[ 3996],
| 20.0000th=[ 3996], 30.0000th=[ 3996], 40.0000th=[ 3996],
| 50.0000th=[ 3996], 60.0000th=[ 3996], 70.0000th=[ 4036],
| 80.0000th=[ 4036], 90.0000th=[ 4076], 95.0000th=[ 4116],
| 99.0000th=[ 4196], 99.5000th=[ 5341], 99.9000th=[ 7590],
| 99.9500th=[ 9357], 99.9900th=[32289]
QD=2, Batch=1
Maximum IOPS=487K
932: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3714], 5.0000th=[ 3755], 10.0000th=[ 3755],
| 20.0000th=[ 3755], 30.0000th=[ 3795], 40.0000th=[ 3795],
| 50.0000th=[ 3996], 60.0000th=[ 4036], 70.0000th=[ 4036],
| 80.0000th=[ 4036], 90.0000th=[ 4076], 95.0000th=[ 4116],
| 99.0000th=[ 4437], 99.5000th=[ 6224], 99.9000th=[ 7510],
| 99.9500th=[ 9598], 99.9900th=[32289]
QD=4, Batch=1
Maximum IOPS=921K
955: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3393], 5.0000th=[ 3433], 10.0000th=[ 3514],
| 20.0000th=[ 3594], 30.0000th=[ 3674], 40.0000th=[ 3795],
| 50.0000th=[ 3875], 60.0000th=[ 3996], 70.0000th=[ 4036],
| 80.0000th=[ 4156], 90.0000th=[ 4317], 95.0000th=[ 4678],
| 99.0000th=[ 5120], 99.5000th=[ 5903], 99.9000th=[ 9116],
| 99.9500th=[10522], 99.9900th=[32289]
QD=8, Batch=2
Maximum IOPS=1658K
981: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3313], 5.0000th=[ 3514], 10.0000th=[ 3594],
| 20.0000th=[ 3714], 30.0000th=[ 3835], 40.0000th=[ 3996],
| 50.0000th=[ 4116], 60.0000th=[ 4196], 70.0000th=[ 4397],
| 80.0000th=[ 4598], 90.0000th=[ 4718], 95.0000th=[ 4919],
| 99.0000th=[ 6385], 99.5000th=[ 6947], 99.9000th=[10000],
| 99.9500th=[15180], 99.9900th=[32932]
QD=16, Batch=4
Maximum IOPS=2749K
1010: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3955], 5.0000th=[ 4437], 10.0000th=[ 4558],
| 20.0000th=[ 4759], 30.0000th=[ 4959], 40.0000th=[ 5120],
| 50.0000th=[ 5261], 60.0000th=[ 5502], 70.0000th=[ 5743],
| 80.0000th=[ 5903], 90.0000th=[ 6224], 95.0000th=[ 6626],
| 99.0000th=[ 8313], 99.5000th=[ 9036], 99.9000th=[11967],
| 99.9500th=[32289], 99.9900th=[34217]
QD=32, Batch=8
Maximum IOPS=3583K
1050: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 4879], 5.0000th=[ 5582], 10.0000th=[ 5903],
| 20.0000th=[ 6224], 30.0000th=[ 6465], 40.0000th=[ 6626],
| 50.0000th=[ 6787], 60.0000th=[ 7028], 70.0000th=[ 7349],
| 80.0000th=[ 7911], 90.0000th=[ 8634], 95.0000th=[ 9196],
| 99.0000th=[11164], 99.5000th=[11967], 99.9000th=[16305],
| 99.9500th=[34217], 99.9900th=[37108]
QD=64, Batch=16
Maximum IOPS=3959K
1081: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 6546], 5.0000th=[ 7108], 10.0000th=[ 7429],
| 20.0000th=[ 7992], 30.0000th=[ 8313], 40.0000th=[ 8955],
| 50.0000th=[10000], 60.0000th=[11004], 70.0000th=[11646],
| 80.0000th=[12128], 90.0000th=[12931], 95.0000th=[13735],
| 99.0000th=[15984], 99.5000th=[16787], 99.9000th=[33253],
| 99.9500th=[38072], 99.9900th=[41446]
--
Jens Axboe
* Re: [PATCH v2 for-next 0/8] io_uring: tw contention improvments
2022-06-22 15:21 ` [PATCH v2 for-next 0/8] io_uring: tw contention improvments Jens Axboe
@ 2022-06-23 8:23 ` Hao Xu
0 siblings, 0 replies; 12+ messages in thread
From: Hao Xu @ 2022-06-23 8:23 UTC (permalink / raw)
To: Jens Axboe, Dylan Yudaken, asml.silence, io-uring; +Cc: Kernel-team
On 6/22/22 23:21, Jens Axboe wrote:
> On 6/22/22 7:40 AM, Dylan Yudaken wrote:
>> Task work currently uses a spin lock to guard task_list and
>> task_running. Some use cases such as networking can trigger task_work_add
>> from multiple threads all at once, which suffers from contention here.
>>
>> This can be changed to use a lockless list which seems to have better
>> performance. Running the micro benchmark in [1] I see 20% improvment in
>> multithreaded task work add. It required removing the priority tw list
>> optimisation, however it isn't clear how important that optimisation is.
>> Additionally it has fairly easy to break semantics.
>>
>> Patch 1-2 remove the priority tw list optimisation
>> Patch 3-5 add lockless lists for task work
>> Patch 6 fixes a bug I noticed in io_uring event tracing
>> Patch 7-8 adds tracing for task_work_run
>
> I ran some IRQ driven workloads on this. Basic 512b random read, DIO,
> IRQ, and then at queue depths 1-64, doubling every time. Once we get to
> QD=8, start doing submit/complete batch of 1/4th of the QD so we ramp up
> there too. Results below, first set is 5.19-rc3 + for-5.20/io_uring,
> second set is that plus this series.
>
> This is what I ran:
>
> sudo taskset -c 12 t/io_uring -d<QD> -b512 -s<batch> -c<batch> -p0 -F1 -B1 -n1 -D0 -R0 -X1 -R1 -t1 -r5 /dev/nvme0n1
>
> on a gen2 optane drive.
>
> tldr - looks like an improvement there too, and no ill effects seen on
> latency.
Looks so, nice.
>
> [...]
* Re: [PATCH v2 for-next 0/8] io_uring: tw contention improvments
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (8 preceding siblings ...)
2022-06-22 15:21 ` [PATCH v2 for-next 0/8] io_uring: tw contention improvments Jens Axboe
@ 2022-06-22 17:39 ` Jens Axboe
9 siblings, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2022-06-22 17:39 UTC (permalink / raw)
To: dylany, asml.silence, io-uring; +Cc: Kernel-team
On Wed, 22 Jun 2022 06:40:20 -0700, Dylan Yudaken wrote:
> Task work currently uses a spin lock to guard task_list and
> task_running. Some use cases such as networking can trigger task_work_add
> from multiple threads all at once, which suffers from contention here.
>
> This can be changed to use a lockless list which seems to have better
> performance. Running the micro benchmark in [1] I see 20% improvment in
> multithreaded task work add. It required removing the priority tw list
> optimisation, however it isn't clear how important that optimisation is.
> Additionally it has fairly easy to break semantics.
>
> [...]
Applied, thanks!
[1/8] io_uring: remove priority tw list optimisation
commit: bb35381ea1b3980704809f1c13d7831989a9bc97
[2/8] io_uring: remove __io_req_task_work_add
commit: fbfa4521091037bdfe499501d4c7ed175592ccd4
[3/8] io_uring: lockless task list
commit: f032372c18b0730f551b8fa0a354ce2e84cfcbb7
[4/8] io_uring: introduce llist helpers
commit: c0808632a83a7c607a987154372e705353acf4f2
[5/8] io_uring: batch task_work
commit: 7afb384a25b0ed597defad431dcc83b5f509c98e
[6/8] io_uring: move io_uring_get_opcode out of TP_printk
commit: 1da6baa4e4c290cebafec3341dbf3cbca21081b7
[7/8] io_uring: add trace event for running task work
commit: d34b8ba25f0c3503f8766bd595c6d28e01cbbd54
[8/8] io_uring: trace task_work_run
commit: e57a6f13bec58afe717894ce7fb7e6061c3fc2f4
Best regards,
--
Jens Axboe