* [PATCH v2 for-next 1/8] io_uring: remove priority tw list optimisation
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
` (8 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This optimisation has some built-in assumptions that make it easy to
introduce bugs. It also does not have clear wins that make it worth keeping.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 77 +++++++--------------------------------------
io_uring/io_uring.h | 1 -
io_uring/rw.c | 2 +-
io_uring/tctx.c | 1 -
io_uring/tctx.h | 1 -
5 files changed, 12 insertions(+), 70 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index afda42246d12..cc524d33748d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,44 +986,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-static void handle_prev_tw_list(struct io_wq_work_node *node,
- struct io_ring_ctx **ctx, bool *uring_locked)
-{
- if (*ctx && !*uring_locked)
- spin_lock(&(*ctx)->completion_lock);
-
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
- if (req->ctx != *ctx) {
- if (unlikely(!*uring_locked && *ctx))
- io_cq_unlock_post(*ctx);
-
- ctx_flush_and_put(*ctx, uring_locked);
- *ctx = req->ctx;
- /* if not contended, grab and improve batching */
- *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
- percpu_ref_get(&(*ctx)->refs);
- if (unlikely(!*uring_locked))
- io_cq_lock(*ctx);
- }
- if (likely(*uring_locked)) {
- req->io_task_work.func(req, uring_locked);
- } else {
- req->cqe.flags = io_put_kbuf_comp(req);
- __io_req_complete_post(req);
- }
- node = next;
- } while (node);
-
- if (unlikely(!*uring_locked))
- io_cq_unlock_post(*ctx);
-}
-
static void handle_tw_list(struct io_wq_work_node *node,
struct io_ring_ctx **ctx, bool *locked)
{
@@ -1054,27 +1016,20 @@ void tctx_task_work(struct callback_head *cb)
task_work);
while (1) {
- struct io_wq_work_node *node1, *node2;
+ struct io_wq_work_node *node;
spin_lock_irq(&tctx->task_lock);
- node1 = tctx->prio_task_list.first;
- node2 = tctx->task_list.first;
+ node = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prio_task_list);
- if (!node2 && !node1)
+ if (!node)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
- if (!node2 && !node1)
+ if (!node)
break;
-
- if (node1)
- handle_prev_tw_list(node1, &ctx, &uring_locked);
- if (node2)
- handle_tw_list(node2, &ctx, &uring_locked);
+ handle_tw_list(node, &ctx, &uring_locked);
cond_resched();
- if (data_race(!tctx->task_list.first) &&
- data_race(!tctx->prio_task_list.first) && uring_locked)
+ if (data_race(!tctx->task_list.first) && uring_locked)
io_submit_flush_completions(ctx);
}
@@ -1086,8 +1041,7 @@ void tctx_task_work(struct callback_head *cb)
}
static void __io_req_task_work_add(struct io_kiocb *req,
- struct io_uring_task *tctx,
- struct io_wq_work_list *list)
+ struct io_uring_task *tctx)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_wq_work_node *node;
@@ -1095,7 +1049,7 @@ static void __io_req_task_work_add(struct io_kiocb *req,
bool running;
spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, list);
+ wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -1113,7 +1067,8 @@ static void __io_req_task_work_add(struct io_kiocb *req,
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
+ node = tctx->task_list.first;
+ INIT_WQ_LIST(&tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -1129,17 +1084,7 @@ void io_req_task_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
- __io_req_task_work_add(req, tctx, &tctx->task_list);
-}
-
-void io_req_task_prio_work_add(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = req->task->io_uring;
-
- if (req->ctx->flags & IORING_SETUP_SQPOLL)
- __io_req_task_work_add(req, tctx, &tctx->prio_task_list);
- else
- __io_req_task_work_add(req, tctx, &tctx->task_list);
+ __io_req_task_work_add(req, tctx);
}
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index f026d2670959..f77e4a5403e4 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -36,7 +36,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_work_add(struct io_kiocb *req);
-void io_req_task_prio_work_add(struct io_kiocb *req);
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index a308fc956114..e6cf1c3d8a29 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -215,7 +215,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
return;
io_req_set_res(req, res, 0);
req->io_task_work.func = io_req_task_complete;
- io_req_task_prio_work_add(req);
+ io_req_task_work_add(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 9b30fb0d3603..7a68ba9beec3 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -88,7 +88,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prio_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index dead0ed00429..c8566ea5dca4 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -22,7 +22,6 @@ struct io_uring_task {
spinlock_t task_lock;
bool task_running;
struct io_wq_work_list task_list;
- struct io_wq_work_list prio_task_list;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
--
2.30.2
* [PATCH v2 for-next 2/8] io_uring: remove __io_req_task_work_add
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 3/8] io_uring: lockless task list Dylan Yudaken
` (7 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This is no longer needed as there is only one caller.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cc524d33748d..e1523b62103b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1040,9 +1040,9 @@ void tctx_task_work(struct callback_head *cb)
io_uring_drop_tctx_refs(current);
}
-static void __io_req_task_work_add(struct io_kiocb *req,
- struct io_uring_task *tctx)
+void io_req_task_work_add(struct io_kiocb *req)
{
+ struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
struct io_wq_work_node *node;
unsigned long flags;
@@ -1080,13 +1080,6 @@ static void __io_req_task_work_add(struct io_kiocb *req,
}
}
-void io_req_task_work_add(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = req->task->io_uring;
-
- __io_req_task_work_add(req, tctx);
-}
-
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
{
io_req_complete_post(req);
--
2.30.2
* [PATCH v2 for-next 3/8] io_uring: lockless task list
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
` (6 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
With networking use cases we see contention on the spinlock used to
protect the task_list when multiple threads try to add completions at once.
Instead we can use a lockless list, and assume that the first caller to
add to the list is responsible for kicking off task work.
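For reference, a minimal sketch of the add-side pattern this relies on
(illustrative only, not the exact io_uring code; the real logic is in
io_req_task_work_add() in the diff below). llist_add() returns true only
when the list was empty before the insertion, so exactly one producer
observes "first" and queues the task_work callback:

/*
 * Sketch, assuming <linux/llist.h> and <linux/task_work.h>; TWA_SIGNAL
 * stands in for the ring's actual notify method.
 */
static void sketch_tw_add(struct io_kiocb *req, struct io_uring_task *tctx)
{
	bool first = llist_add(&req->io_task_work.node, &tctx->task_list);

	/*
	 * Only the caller that found the list empty schedules task work;
	 * everyone else piggybacks on the already-pending run.
	 */
	if (first)
		task_work_add(req->task, &tctx->task_work, TWA_SIGNAL);
}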
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/linux/io_uring_types.h | 2 +-
io_uring/io_uring.c | 38 ++++++++--------------------------
io_uring/tctx.c | 3 +--
io_uring/tctx.h | 6 +++---
4 files changed, 14 insertions(+), 35 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5987f8acca38..918165a20053 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
struct io_task_work {
union {
- struct io_wq_work_node node;
+ struct llist_node node;
struct llist_node fallback_node;
};
io_req_tw_func_t func;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e1523b62103b..985b46dfebb6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-static void handle_tw_list(struct io_wq_work_node *node,
+
+static void handle_tw_list(struct llist_node *node,
struct io_ring_ctx **ctx, bool *locked)
{
do {
- struct io_wq_work_node *next = node->next;
+ struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
+ struct llist_node *node = llist_del_all(&tctx->task_list);
- while (1) {
- struct io_wq_work_node *node;
-
- spin_lock_irq(&tctx->task_lock);
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
- if (!node)
- tctx->task_running = false;
- spin_unlock_irq(&tctx->task_lock);
- if (!node)
- break;
+ if (node) {
handle_tw_list(node, &ctx, &uring_locked);
cond_resched();
-
- if (data_race(!tctx->task_list.first) && uring_locked)
- io_submit_flush_completions(ctx);
}
ctx_flush_and_put(ctx, &uring_locked);
@@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
- struct io_wq_work_node *node;
- unsigned long flags;
+ struct llist_node *node;
bool running;
- spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
- running = tctx->task_running;
- if (!running)
- tctx->task_running = true;
- spin_unlock_irqrestore(&tctx->task_lock, flags);
+ running = !llist_add(&req->io_task_work.node, &tctx->task_list);
/* task_work already pending, we're done */
if (running)
@@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req)
if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
return;
- spin_lock_irqsave(&tctx->task_lock, flags);
- tctx->task_running = false;
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
- spin_unlock_irqrestore(&tctx->task_lock, flags);
+
+ node = llist_del_all(&tctx->task_list);
while (node) {
req = container_of(node, struct io_kiocb, io_task_work.node);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 7a68ba9beec3..7f97d97fef0a 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
- spin_lock_init(&tctx->task_lock);
- INIT_WQ_LIST(&tctx->task_list);
+ init_llist_head(&tctx->task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index c8566ea5dca4..8a33ff6e5d91 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/llist.h>
+
/*
* Arbitrary limit, can be raised if need be
*/
@@ -19,9 +21,7 @@ struct io_uring_task {
struct percpu_counter inflight;
struct { /* task_work */
- spinlock_t task_lock;
- bool task_running;
- struct io_wq_work_list task_list;
+ struct llist_head task_list;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
--
2.30.2
* [PATCH v2 for-next 4/8] io_uring: introduce llist helpers
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (2 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 3/8] io_uring: lockless task list Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 5/8] io_uring: batch task_work Dylan Yudaken
` (5 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Introduce helpers to atomically switch an llist.
These will later be moved into common code.
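As a rough usage sketch (illustrative, not part of the patch): passing NULL
as the new head makes io_llist_xchg() behave like llist_del_all(), detaching
every queued node in one atomic step, while io_llist_cmpxchg() only swaps
the head if it still matches the expected old value:

	/* Drain everything currently queued, assuming the helpers below. */
	struct llist_node *node = io_llist_xchg(&tctx->task_list, NULL);

	while (node) {
		struct llist_node *next = node->next;

		/* ... process the request embedding this node ... */
		node = next;
	}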
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 985b46dfebb6..eb29e3f7da5c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1009,6 +1009,36 @@ static void handle_tw_list(struct llist_node *node,
} while (node);
}
+/**
+ * io_llist_xchg - swap all entries in a lock-less list
+ * @head: the head of lock-less list to delete all entries
+ * @node: new entry as the head of the list
+ *
+ * If list is empty, return NULL, otherwise, return the pointer to the first entry.
+ * The order of entries returned is from the newest to the oldest added one.
+ */
+static inline struct llist_node *io_llist_xchg(struct llist_head *head,
+ struct llist_node *node)
+{
+ return xchg(&head->first, node);
+}
+
+/**
+ * io_llist_cmpxchg - possibly swap all entries in a lock-less list
+ * @head: the head of lock-less list to delete all entries
+ * @old: expected old value of the first entry of the list
+ * @new: new entry as the head of the list
+ *
+ * perform a cmpxchg on the first entry of the list.
+ */
+
+static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
+ struct llist_node *old,
+ struct llist_node *new)
+{
+ return cmpxchg(&head->first, old, new);
+}
+
void tctx_task_work(struct callback_head *cb)
{
bool uring_locked = false;
--
2.30.2
* [PATCH v2 for-next 5/8] io_uring: batch task_work
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (3 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
` (4 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Batching task work up is an important performance optimisation, as
task_work_add is expensive.
In order to keep the semantics, replace the task_list with a fake node
while processing the old list, and then do a cmpxchg at the end to see if
there is more work.
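A commented sketch of the resulting consume loop (the comments are
explanatory additions; the calls mirror the hunk below):

	struct llist_node fake = {};
	/*
	 * Publish the fake node as the list head: producers now always see a
	 * non-empty list, so they keep adding without re-arming task_work.
	 */
	struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);

	/* Run everything that was queued before the swap. */
	handle_tw_list(node, &ctx, &uring_locked, NULL);

	/* If nothing new arrived, swap &fake out for NULL and stop. */
	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
	while (node != &fake) {
		/*
		 * New work was queued in front of &fake: detach the whole
		 * chain and run it, using &fake as the terminating node.
		 */
		node = io_llist_xchg(&tctx->task_list, &fake);
		handle_tw_list(node, &ctx, &uring_locked, &fake);
		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
	}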
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 22 +++++++++++++---------
1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index eb29e3f7da5c..19bd7d5ec90c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,11 +986,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-
static void handle_tw_list(struct llist_node *node,
- struct io_ring_ctx **ctx, bool *locked)
+ struct io_ring_ctx **ctx, bool *locked,
+ struct llist_node *last)
{
- do {
+ while (node != last) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1006,7 +1006,7 @@ static void handle_tw_list(struct llist_node *node,
}
req->io_task_work.func(req, locked);
node = next;
- } while (node);
+ }
}
/**
@@ -1045,11 +1045,15 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
- struct llist_node *node = llist_del_all(&tctx->task_list);
-
- if (node) {
- handle_tw_list(node, &ctx, &uring_locked);
- cond_resched();
+ struct llist_node fake = {};
+ struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+
+ handle_tw_list(node, &ctx, &uring_locked, NULL);
+ node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
+ while (node != &fake) {
+ node = io_llist_xchg(&tctx->task_list, &fake);
+ handle_tw_list(node, &ctx, &uring_locked, &fake);
+ node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
}
ctx_flush_and_put(ctx, &uring_locked);
--
2.30.2
* [PATCH v2 for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (4 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 5/8] io_uring: batch task_work Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
` (3 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
The TP_printk macros are not supposed to use custom code ([1]) or else
tools such as perf cannot use these events.
Convert the opcode string representation to use the __string wiring that
the event framework provides ([2]).
[1]: https://lwn.net/Articles/379903/
[2]: https://lwn.net/Articles/381064/
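For anyone unfamiliar with that wiring, a minimal hypothetical event using
it would look roughly like this (the event name is made up; the real
conversions are in the diff below):

	TRACE_EVENT(example_opcode_event,
		TP_PROTO(struct io_kiocb *req),
		TP_ARGS(req),

		TP_STRUCT__entry(
			/* reserve space for the string in the trace entry */
			__string( op_str, io_uring_get_opcode(req->opcode) )
		),

		TP_fast_assign(
			/* copy the string at trace time, while req is valid */
			__assign_str(op_str, io_uring_get_opcode(req->opcode));
		),

		/* TP_printk now only formats recorded fields, so perf can parse it */
		TP_printk("opcode %s", __get_str(op_str))
	);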
Fixes: 033b87d2 ("io_uring: use the text representation of ops in trace")
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/trace/events/io_uring.h | 42 +++++++++++++++++++++++++++------
1 file changed, 35 insertions(+), 7 deletions(-)
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 5635912e1013..3bc8dec9acaa 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -151,6 +151,8 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( unsigned int, flags )
__field( struct io_wq_work *, work )
__field( int, rw )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -161,11 +163,13 @@ TRACE_EVENT(io_uring_queue_async_work,
__entry->opcode = req->opcode;
__entry->work = &req->work;
__entry->rw = rw;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
);
@@ -188,6 +192,8 @@ TRACE_EVENT(io_uring_defer,
__field( void *, req )
__field( unsigned long long, data )
__field( u8, opcode )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -195,11 +201,13 @@ TRACE_EVENT(io_uring_defer,
__entry->req = req;
__entry->data = req->cqe.user_data;
__entry->opcode = req->opcode;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
__entry->ctx, __entry->req, __entry->data,
- io_uring_get_opcode(__entry->opcode))
+ __get_str(op_str))
);
/**
@@ -284,6 +292,8 @@ TRACE_EVENT(io_uring_fail_link,
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( void *, link )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -292,11 +302,13 @@ TRACE_EVENT(io_uring_fail_link,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->link = link;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode), __entry->link)
+ __get_str(op_str), __entry->link)
);
/**
@@ -370,6 +382,8 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -380,11 +394,13 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
- __entry->user_data, io_uring_get_opcode(__entry->opcode),
+ __entry->user_data, __get_str(op_str),
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
);
@@ -411,6 +427,8 @@ TRACE_EVENT(io_uring_poll_arm,
__field( u8, opcode )
__field( int, mask )
__field( int, events )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -420,11 +438,13 @@ TRACE_EVENT(io_uring_poll_arm,
__entry->opcode = req->opcode;
__entry->mask = mask;
__entry->events = events;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->mask, __entry->events)
);
@@ -447,6 +467,8 @@ TRACE_EVENT(io_uring_task_add,
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( int, mask )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -455,11 +477,13 @@ TRACE_EVENT(io_uring_task_add,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->mask = mask;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->mask)
);
@@ -495,6 +519,8 @@ TRACE_EVENT(io_uring_req_failed,
__field( u64, pad1 )
__field( u64, addr3 )
__field( int, error )
+
+ __string( op_str, io_uring_get_opcode(sqe->opcode) )
),
TP_fast_assign(
@@ -514,6 +540,8 @@ TRACE_EVENT(io_uring_req_failed,
__entry->pad1 = sqe->__pad2[0];
__entry->addr3 = sqe->addr3;
__entry->error = error;
+
+ __assign_str(op_str, io_uring_get_opcode(sqe->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, "
@@ -522,7 +550,7 @@ TRACE_EVENT(io_uring_req_failed,
"personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, "
"error=%d",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->flags, __entry->ioprio,
(unsigned long long)__entry->off,
(unsigned long long) __entry->addr, __entry->len,
--
2.30.2
* [PATCH v2 for-next 7/8] io_uring: add trace event for running task work
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (5 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 13:40 ` [PATCH v2 for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
` (2 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This is useful for investigating whether task_work is batching.
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 3bc8dec9acaa..918e3a43e4b2 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow,
__entry->cflags, __entry->ocqe)
);
+/*
+ * io_uring_task_work_run - ran task work
+ *
+ * @tctx: pointer to a io_uring_task
+ * @count: how many functions it ran
+ * @loops: how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_task_work_run,
+
+ TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+
+ TP_ARGS(tctx, count, loops),
+
+ TP_STRUCT__entry (
+ __field( void *, tctx )
+ __field( unsigned int, count )
+ __field( unsigned int, loops )
+ ),
+
+ TP_fast_assign(
+ __entry->tctx = tctx;
+ __entry->count = count;
+ __entry->loops = loops;
+ ),
+
+ TP_printk("tctx %p, count %u, loops %u",
+ __entry->tctx, __entry->count, __entry->loops)
+);
+
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
--
2.30.2
* [PATCH v2 for-next 8/8] io_uring: trace task_work_run
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (6 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
@ 2022-06-22 13:40 ` Dylan Yudaken
2022-06-22 15:21 ` [PATCH v2 for-next 0/8] io_uring: tw contention improvments Jens Axboe
2022-06-22 17:39 ` Jens Axboe
9 siblings, 0 replies; 12+ messages in thread
From: Dylan Yudaken @ 2022-06-22 13:40 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Trace task_work_run to help provide stats on how often task work is run
and what batch sizes are coming through.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 19bd7d5ec90c..1b359249e933 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,10 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-static void handle_tw_list(struct llist_node *node,
- struct io_ring_ctx **ctx, bool *locked,
- struct llist_node *last)
+static unsigned int handle_tw_list(struct llist_node *node,
+ struct io_ring_ctx **ctx, bool *locked,
+ struct llist_node *last)
{
+ unsigned int count = 0;
+
while (node != last) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
@@ -1006,7 +1008,10 @@ static void handle_tw_list(struct llist_node *node,
}
req->io_task_work.func(req, locked);
node = next;
+ count++;
}
+
+ return count;
}
/**
@@ -1047,12 +1052,14 @@ void tctx_task_work(struct callback_head *cb)
task_work);
struct llist_node fake = {};
struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+ unsigned int loops = 1;
+ unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL);
- handle_tw_list(node, &ctx, &uring_locked, NULL);
node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
while (node != &fake) {
+ loops++;
node = io_llist_xchg(&tctx->task_list, &fake);
- handle_tw_list(node, &ctx, &uring_locked, &fake);
+ count += handle_tw_list(node, &ctx, &uring_locked, &fake);
node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
}
@@ -1061,6 +1068,8 @@ void tctx_task_work(struct callback_head *cb)
/* relaxed read is enough as only the task itself sets ->in_idle */
if (unlikely(atomic_read(&tctx->in_idle)))
io_uring_drop_tctx_refs(current);
+
+ trace_io_uring_task_work_run(tctx, count, loops);
}
void io_req_task_work_add(struct io_kiocb *req)
--
2.30.2
* Re: [PATCH v2 for-next 0/8] io_uring: tw contention improvments
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (7 preceding siblings ...)
2022-06-22 13:40 ` [PATCH v2 for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
@ 2022-06-22 15:21 ` Jens Axboe
2022-06-23 8:23 ` Hao Xu
2022-06-22 17:39 ` Jens Axboe
9 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2022-06-22 15:21 UTC (permalink / raw)
To: Dylan Yudaken, asml.silence, io-uring; +Cc: Kernel-team
On 6/22/22 7:40 AM, Dylan Yudaken wrote:
> Task work currently uses a spin lock to guard task_list and
> task_running. Some use cases such as networking can trigger task_work_add
> from multiple threads all at once, which suffers from contention here.
>
> This can be changed to use a lockless list which seems to have better
> performance. Running the micro benchmark in [1] I see 20% improvment in
> multithreaded task work add. It required removing the priority tw list
> optimisation, however it isn't clear how important that optimisation is.
> Additionally it has fairly easy to break semantics.
>
> Patch 1-2 remove the priority tw list optimisation
> Patch 3-5 add lockless lists for task work
> Patch 6 fixes a bug I noticed in io_uring event tracing
> Patch 7-8 adds tracing for task_work_run
I ran some IRQ driven workloads on this. Basic 512b random read, DIO,
IRQ, and then at queue depths 1-64, doubling every time. Once we get to
QD=8, start doing submit/complete batch of 1/4th of the QD so we ramp up
there too. Results below, first set is 5.19-rc3 + for-5.20/io_uring,
second set is that plus this series.
This is what I ran:
sudo taskset -c 12 t/io_uring -d<QD> -b512 -s<batch> -c<batch> -p0 -F1 -B1 -n1 -D0 -R0 -X1 -R1 -t1 -r5 /dev/nvme0n1
on a gen2 optane drive.
tldr - looks like an improvement there too, and no ill effects seen on
latency.
5.19-rc3 + for-5.20/io_uring:
QD=1, Batch=1
Maximum IOPS=244K
1509: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3996], 5.0000th=[ 3996], 10.0000th=[ 3996],
| 20.0000th=[ 4036], 30.0000th=[ 4036], 40.0000th=[ 4036],
| 50.0000th=[ 4036], 60.0000th=[ 4036], 70.0000th=[ 4036],
| 80.0000th=[ 4076], 90.0000th=[ 4116], 95.0000th=[ 4196],
| 99.0000th=[ 4437], 99.5000th=[ 5421], 99.9000th=[ 7590],
| 99.9500th=[ 9518], 99.9900th=[32289]
QD=2, Batch=1
Maximum IOPS=483K
1533: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3714], 5.0000th=[ 3755], 10.0000th=[ 3795],
| 20.0000th=[ 3795], 30.0000th=[ 3835], 40.0000th=[ 3955],
| 50.0000th=[ 4036], 60.0000th=[ 4076], 70.0000th=[ 4076],
| 80.0000th=[ 4076], 90.0000th=[ 4116], 95.0000th=[ 4156],
| 99.0000th=[ 4518], 99.5000th=[ 6144], 99.9000th=[ 7510],
| 99.9500th=[ 9839], 99.9900th=[32289]
QD=4, Batch=1
Maximum IOPS=907K
1583: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3393], 5.0000th=[ 3514], 10.0000th=[ 3594],
| 20.0000th=[ 3634], 30.0000th=[ 3795], 40.0000th=[ 3875],
| 50.0000th=[ 3955], 60.0000th=[ 4076], 70.0000th=[ 4156],
| 80.0000th=[ 4277], 90.0000th=[ 4397], 95.0000th=[ 4477],
| 99.0000th=[ 5120], 99.5000th=[ 5903], 99.9000th=[ 9357],
| 99.9500th=[11004], 99.9900th=[32289]
QD=8, Batch=2
Maximum IOPS=1688K
1631: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3353], 5.0000th=[ 3554], 10.0000th=[ 3634],
| 20.0000th=[ 3755], 30.0000th=[ 3875], 40.0000th=[ 4036],
| 50.0000th=[ 4156], 60.0000th=[ 4277], 70.0000th=[ 4437],
| 80.0000th=[ 4678], 90.0000th=[ 4839], 95.0000th=[ 5040],
| 99.0000th=[ 6305], 99.5000th=[ 7028], 99.9000th=[10080],
| 99.9500th=[15502], 99.9900th=[32932]
QD=16, Batch=4
Maximum IOPS=2613K
1680: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3955], 5.0000th=[ 4397], 10.0000th=[ 4558],
| 20.0000th=[ 4759], 30.0000th=[ 4959], 40.0000th=[ 5120],
| 50.0000th=[ 5261], 60.0000th=[ 5502], 70.0000th=[ 5743],
| 80.0000th=[ 5903], 90.0000th=[ 6305], 95.0000th=[ 6706],
| 99.0000th=[ 8393], 99.5000th=[ 8955], 99.9000th=[11325],
| 99.9500th=[31968], 99.9900th=[34217]
QD=32, Batch=8
Maximum IOPS=3573K
1706: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 4919], 5.0000th=[ 5662], 10.0000th=[ 5903],
| 20.0000th=[ 6144], 30.0000th=[ 6465], 40.0000th=[ 6626],
| 50.0000th=[ 6867], 60.0000th=[ 7188], 70.0000th=[ 7510],
| 80.0000th=[ 7992], 90.0000th=[ 8714], 95.0000th=[ 9357],
| 99.0000th=[11325], 99.5000th=[11967], 99.9000th=[16626],
| 99.9500th=[34217], 99.9900th=[37108]
QD=64, Batch=16
Maximum IOPS=3953K
1735: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 6626], 5.0000th=[ 7188], 10.0000th=[ 7510],
| 20.0000th=[ 7992], 30.0000th=[ 8393], 40.0000th=[ 9116],
| 50.0000th=[10160], 60.0000th=[11164], 70.0000th=[11646],
| 80.0000th=[12128], 90.0000th=[12931], 95.0000th=[13735],
| 99.0000th=[15984], 99.5000th=[16787], 99.9000th=[34217],
| 99.9500th=[38072], 99.9900th=[40964]
============
5.19-rc3 + for-5.20/io_uring + this series:
QD=1, Batch=1
Maximum IOPS=246K
909: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3955], 5.0000th=[ 3996], 10.0000th=[ 3996],
| 20.0000th=[ 3996], 30.0000th=[ 3996], 40.0000th=[ 3996],
| 50.0000th=[ 3996], 60.0000th=[ 3996], 70.0000th=[ 4036],
| 80.0000th=[ 4036], 90.0000th=[ 4076], 95.0000th=[ 4116],
| 99.0000th=[ 4196], 99.5000th=[ 5341], 99.9000th=[ 7590],
| 99.9500th=[ 9357], 99.9900th=[32289]
QD=2, Batch=1
Maximum IOPS=487K
932: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3714], 5.0000th=[ 3755], 10.0000th=[ 3755],
| 20.0000th=[ 3755], 30.0000th=[ 3795], 40.0000th=[ 3795],
| 50.0000th=[ 3996], 60.0000th=[ 4036], 70.0000th=[ 4036],
| 80.0000th=[ 4036], 90.0000th=[ 4076], 95.0000th=[ 4116],
| 99.0000th=[ 4437], 99.5000th=[ 6224], 99.9000th=[ 7510],
| 99.9500th=[ 9598], 99.9900th=[32289]
QD=4, Batch=1
Maximum IOPS=921K
955: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3393], 5.0000th=[ 3433], 10.0000th=[ 3514],
| 20.0000th=[ 3594], 30.0000th=[ 3674], 40.0000th=[ 3795],
| 50.0000th=[ 3875], 60.0000th=[ 3996], 70.0000th=[ 4036],
| 80.0000th=[ 4156], 90.0000th=[ 4317], 95.0000th=[ 4678],
| 99.0000th=[ 5120], 99.5000th=[ 5903], 99.9000th=[ 9116],
| 99.9500th=[10522], 99.9900th=[32289]
QD=8, Batch=2
Maximum IOPS=1658K
981: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3313], 5.0000th=[ 3514], 10.0000th=[ 3594],
| 20.0000th=[ 3714], 30.0000th=[ 3835], 40.0000th=[ 3996],
| 50.0000th=[ 4116], 60.0000th=[ 4196], 70.0000th=[ 4397],
| 80.0000th=[ 4598], 90.0000th=[ 4718], 95.0000th=[ 4919],
| 99.0000th=[ 6385], 99.5000th=[ 6947], 99.9000th=[10000],
| 99.9500th=[15180], 99.9900th=[32932]
QD=16, Batch=4
Maximum IOPS=2749K
1010: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 3955], 5.0000th=[ 4437], 10.0000th=[ 4558],
| 20.0000th=[ 4759], 30.0000th=[ 4959], 40.0000th=[ 5120],
| 50.0000th=[ 5261], 60.0000th=[ 5502], 70.0000th=[ 5743],
| 80.0000th=[ 5903], 90.0000th=[ 6224], 95.0000th=[ 6626],
| 99.0000th=[ 8313], 99.5000th=[ 9036], 99.9000th=[11967],
| 99.9500th=[32289], 99.9900th=[34217]
QD=32, Batch=8
Maximum IOPS=3583K
1050: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 4879], 5.0000th=[ 5582], 10.0000th=[ 5903],
| 20.0000th=[ 6224], 30.0000th=[ 6465], 40.0000th=[ 6626],
| 50.0000th=[ 6787], 60.0000th=[ 7028], 70.0000th=[ 7349],
| 80.0000th=[ 7911], 90.0000th=[ 8634], 95.0000th=[ 9196],
| 99.0000th=[11164], 99.5000th=[11967], 99.9000th=[16305],
| 99.9500th=[34217], 99.9900th=[37108]
QD=64, Batch=16
Maximum IOPS=3959K
1081: Latency percentiles:
percentiles (nsec):
| 1.0000th=[ 6546], 5.0000th=[ 7108], 10.0000th=[ 7429],
| 20.0000th=[ 7992], 30.0000th=[ 8313], 40.0000th=[ 8955],
| 50.0000th=[10000], 60.0000th=[11004], 70.0000th=[11646],
| 80.0000th=[12128], 90.0000th=[12931], 95.0000th=[13735],
| 99.0000th=[15984], 99.5000th=[16787], 99.9000th=[33253],
| 99.9500th=[38072], 99.9900th=[41446]
--
Jens Axboe
* Re: [PATCH v2 for-next 0/8] io_uring: tw contention improvments
2022-06-22 15:21 ` [PATCH v2 for-next 0/8] io_uring: tw contention improvments Jens Axboe
@ 2022-06-23 8:23 ` Hao Xu
0 siblings, 0 replies; 12+ messages in thread
From: Hao Xu @ 2022-06-23 8:23 UTC (permalink / raw)
To: Jens Axboe, Dylan Yudaken, asml.silence, io-uring; +Cc: Kernel-team
On 6/22/22 23:21, Jens Axboe wrote:
> On 6/22/22 7:40 AM, Dylan Yudaken wrote:
>> Task work currently uses a spin lock to guard task_list and
>> task_running. Some use cases such as networking can trigger task_work_add
>> from multiple threads all at once, which suffers from contention here.
>>
>> This can be changed to use a lockless list which seems to have better
>> performance. Running the micro benchmark in [1] I see 20% improvment in
>> multithreaded task work add. It required removing the priority tw list
>> optimisation, however it isn't clear how important that optimisation is.
>> Additionally it has fairly easy to break semantics.
>>
>> Patch 1-2 remove the priority tw list optimisation
>> Patch 3-5 add lockless lists for task work
>> Patch 6 fixes a bug I noticed in io_uring event tracing
>> Patch 7-8 adds tracing for task_work_run
>
> I ran some IRQ driven workloads on this. Basic 512b random read, DIO,
> IRQ, and then at queue depths 1-64, doubling every time. Once we get to
> QD=8, start doing submit/complete batch of 1/4th of the QD so we ramp up
> there too. Results below, first set is 5.19-rc3 + for-5.20/io_uring,
> second set is that plus this series.
>
> This is what I ran:
>
> sudo taskset -c 12 t/io_uring -d<QD> -b512 -s<batch> -c<batch> -p0 -F1 -B1 -n1 -D0 -R0 -X1 -R1 -t1 -r5 /dev/nvme0n1
>
> on a gen2 optane drive.
>
> tldr - looks like an improvement there too, and no ill effects seen on
> latency.
Looks so, nice.
>
> [...]
* Re: [PATCH v2 for-next 0/8] io_uring: tw contention improvments
2022-06-22 13:40 [PATCH v2 for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (8 preceding siblings ...)
2022-06-22 15:21 ` [PATCH v2 for-next 0/8] io_uring: tw contention improvments Jens Axboe
@ 2022-06-22 17:39 ` Jens Axboe
9 siblings, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2022-06-22 17:39 UTC (permalink / raw)
To: dylany, asml.silence, io-uring; +Cc: Kernel-team
On Wed, 22 Jun 2022 06:40:20 -0700, Dylan Yudaken wrote:
> Task work currently uses a spin lock to guard task_list and
> task_running. Some use cases such as networking can trigger task_work_add
> from multiple threads all at once, which suffers from contention here.
>
> This can be changed to use a lockless list which seems to have better
> performance. Running the micro benchmark in [1] I see 20% improvment in
> multithreaded task work add. It required removing the priority tw list
> optimisation, however it isn't clear how important that optimisation is.
> Additionally it has fairly easy to break semantics.
>
> [...]
Applied, thanks!
[1/8] io_uring: remove priority tw list optimisation
commit: bb35381ea1b3980704809f1c13d7831989a9bc97
[2/8] io_uring: remove __io_req_task_work_add
commit: fbfa4521091037bdfe499501d4c7ed175592ccd4
[3/8] io_uring: lockless task list
commit: f032372c18b0730f551b8fa0a354ce2e84cfcbb7
[4/8] io_uring: introduce llist helpers
commit: c0808632a83a7c607a987154372e705353acf4f2
[5/8] io_uring: batch task_work
commit: 7afb384a25b0ed597defad431dcc83b5f509c98e
[6/8] io_uring: move io_uring_get_opcode out of TP_printk
commit: 1da6baa4e4c290cebafec3341dbf3cbca21081b7
[7/8] io_uring: add trace event for running task work
commit: d34b8ba25f0c3503f8766bd595c6d28e01cbbd54
[8/8] io_uring: trace task_work_run
commit: e57a6f13bec58afe717894ce7fb7e6061c3fc2f4
Best regards,
--
Jens Axboe