* [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
` (7 subsequent siblings)
8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This optimisation has some built-in assumptions that make it easy to
introduce bugs. It also does not have clear wins that make it worth keeping.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 77 +++++++--------------------------------------
io_uring/io_uring.h | 1 -
io_uring/rw.c | 2 +-
io_uring/tctx.c | 1 -
io_uring/tctx.h | 1 -
5 files changed, 12 insertions(+), 70 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index afda42246d12..cc524d33748d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,44 +986,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-static void handle_prev_tw_list(struct io_wq_work_node *node,
- struct io_ring_ctx **ctx, bool *uring_locked)
-{
- if (*ctx && !*uring_locked)
- spin_lock(&(*ctx)->completion_lock);
-
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
- if (req->ctx != *ctx) {
- if (unlikely(!*uring_locked && *ctx))
- io_cq_unlock_post(*ctx);
-
- ctx_flush_and_put(*ctx, uring_locked);
- *ctx = req->ctx;
- /* if not contended, grab and improve batching */
- *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
- percpu_ref_get(&(*ctx)->refs);
- if (unlikely(!*uring_locked))
- io_cq_lock(*ctx);
- }
- if (likely(*uring_locked)) {
- req->io_task_work.func(req, uring_locked);
- } else {
- req->cqe.flags = io_put_kbuf_comp(req);
- __io_req_complete_post(req);
- }
- node = next;
- } while (node);
-
- if (unlikely(!*uring_locked))
- io_cq_unlock_post(*ctx);
-}
-
static void handle_tw_list(struct io_wq_work_node *node,
struct io_ring_ctx **ctx, bool *locked)
{
@@ -1054,27 +1016,20 @@ void tctx_task_work(struct callback_head *cb)
task_work);
while (1) {
- struct io_wq_work_node *node1, *node2;
+ struct io_wq_work_node *node;
spin_lock_irq(&tctx->task_lock);
- node1 = tctx->prio_task_list.first;
- node2 = tctx->task_list.first;
+ node = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prio_task_list);
- if (!node2 && !node1)
+ if (!node)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
- if (!node2 && !node1)
+ if (!node)
break;
-
- if (node1)
- handle_prev_tw_list(node1, &ctx, &uring_locked);
- if (node2)
- handle_tw_list(node2, &ctx, &uring_locked);
+ handle_tw_list(node, &ctx, &uring_locked);
cond_resched();
- if (data_race(!tctx->task_list.first) &&
- data_race(!tctx->prio_task_list.first) && uring_locked)
+ if (data_race(!tctx->task_list.first) && uring_locked)
io_submit_flush_completions(ctx);
}
@@ -1086,8 +1041,7 @@ void tctx_task_work(struct callback_head *cb)
}
static void __io_req_task_work_add(struct io_kiocb *req,
- struct io_uring_task *tctx,
- struct io_wq_work_list *list)
+ struct io_uring_task *tctx)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_wq_work_node *node;
@@ -1095,7 +1049,7 @@ static void __io_req_task_work_add(struct io_kiocb *req,
bool running;
spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, list);
+ wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -1113,7 +1067,8 @@ static void __io_req_task_work_add(struct io_kiocb *req,
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
+ node = tctx->task_list.first;
+ INIT_WQ_LIST(&tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -1129,17 +1084,7 @@ void io_req_task_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
- __io_req_task_work_add(req, tctx, &tctx->task_list);
-}
-
-void io_req_task_prio_work_add(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = req->task->io_uring;
-
- if (req->ctx->flags & IORING_SETUP_SQPOLL)
- __io_req_task_work_add(req, tctx, &tctx->prio_task_list);
- else
- __io_req_task_work_add(req, tctx, &tctx->task_list);
+ __io_req_task_work_add(req, tctx);
}
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 7a00bbe85d35..864e552c76a5 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -35,7 +35,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_work_add(struct io_kiocb *req);
-void io_req_task_prio_work_add(struct io_kiocb *req);
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 9166d8166b82..caf72f50c341 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -215,7 +215,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
return;
io_req_set_res(req, res, 0);
req->io_task_work.func = io_req_task_complete;
- io_req_task_prio_work_add(req);
+ io_req_task_work_add(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 9b30fb0d3603..7a68ba9beec3 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -88,7 +88,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prio_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index dead0ed00429..c8566ea5dca4 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -22,7 +22,6 @@ struct io_uring_task {
spinlock_t task_lock;
bool task_running;
struct io_wq_work_list task_list;
- struct io_wq_work_list prio_task_list;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
--
2.30.2
* [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 3/8] io_uring: lockless task list Dylan Yudaken
` (6 subsequent siblings)
8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This is no longer needed as there is only one caller.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cc524d33748d..e1523b62103b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1040,9 +1040,9 @@ void tctx_task_work(struct callback_head *cb)
io_uring_drop_tctx_refs(current);
}
-static void __io_req_task_work_add(struct io_kiocb *req,
- struct io_uring_task *tctx)
+void io_req_task_work_add(struct io_kiocb *req)
{
+ struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
struct io_wq_work_node *node;
unsigned long flags;
@@ -1080,13 +1080,6 @@ static void __io_req_task_work_add(struct io_kiocb *req,
}
}
-void io_req_task_work_add(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = req->task->io_uring;
-
- __io_req_task_work_add(req, tctx);
-}
-
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
{
io_req_complete_post(req);
--
2.30.2
* [PATCH RFC for-next 3/8] io_uring: lockless task list
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
` (5 subsequent siblings)
8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
With networking use cases we see contention on the spinlock used to
protect the task_list when multiple threads try to add completions at once.
Instead we can use a lockless list, relying on the rule that the first
caller to add to an empty list is responsible for kicking off task work.
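As a minimal sketch of the pattern this relies on (illustrative names, not the
io_uring code itself; io_uring picks its notify method per ring rather than the
TWA_SIGNAL used here):

#include <linux/llist.h>
#include <linux/sched.h>
#include <linux/task_work.h>

/*
 * Sketch only: llist_add() returns true only when it installs the first
 * entry on a previously empty list, so exactly one producer per batch
 * ends up scheduling task_work.
 */
static void example_queue_tw(struct llist_head *list, struct llist_node *node,
			     struct task_struct *task, struct callback_head *cb)
{
	if (llist_add(node, list))
		/* error handling (task already exiting) omitted for brevity */
		(void)task_work_add(task, cb, TWA_SIGNAL);
}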
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/linux/io_uring_types.h | 2 +-
io_uring/io_uring.c | 38 ++++++++--------------------------
io_uring/tctx.c | 3 +--
io_uring/tctx.h | 6 +++---
4 files changed, 14 insertions(+), 35 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5987f8acca38..918165a20053 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
struct io_task_work {
union {
- struct io_wq_work_node node;
+ struct llist_node node;
struct llist_node fallback_node;
};
io_req_tw_func_t func;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e1523b62103b..985b46dfebb6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
percpu_ref_put(&ctx->refs);
}
-static void handle_tw_list(struct io_wq_work_node *node,
+
+static void handle_tw_list(struct llist_node *node,
struct io_ring_ctx **ctx, bool *locked)
{
do {
- struct io_wq_work_node *next = node->next;
+ struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
+ struct llist_node *node = llist_del_all(&tctx->task_list);
- while (1) {
- struct io_wq_work_node *node;
-
- spin_lock_irq(&tctx->task_lock);
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
- if (!node)
- tctx->task_running = false;
- spin_unlock_irq(&tctx->task_lock);
- if (!node)
- break;
+ if (node) {
handle_tw_list(node, &ctx, &uring_locked);
cond_resched();
-
- if (data_race(!tctx->task_list.first) && uring_locked)
- io_submit_flush_completions(ctx);
}
ctx_flush_and_put(ctx, &uring_locked);
@@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
- struct io_wq_work_node *node;
- unsigned long flags;
+ struct llist_node *node;
bool running;
- spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
- running = tctx->task_running;
- if (!running)
- tctx->task_running = true;
- spin_unlock_irqrestore(&tctx->task_lock, flags);
+ running = !llist_add(&req->io_task_work.node, &tctx->task_list);
/* task_work already pending, we're done */
if (running)
@@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req)
if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
return;
- spin_lock_irqsave(&tctx->task_lock, flags);
- tctx->task_running = false;
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
- spin_unlock_irqrestore(&tctx->task_lock, flags);
+
+ node = llist_del_all(&tctx->task_list);
while (node) {
req = container_of(node, struct io_kiocb, io_task_work.node);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 7a68ba9beec3..7f97d97fef0a 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
- spin_lock_init(&tctx->task_lock);
- INIT_WQ_LIST(&tctx->task_list);
+ init_llist_head(&tctx->task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index c8566ea5dca4..8a33ff6e5d91 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/llist.h>
+
/*
* Arbitrary limit, can be raised if need be
*/
@@ -19,9 +21,7 @@ struct io_uring_task {
struct percpu_counter inflight;
struct { /* task_work */
- spinlock_t task_lock;
- bool task_running;
- struct io_wq_work_list task_list;
+ struct llist_head task_list;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
--
2.30.2
* [PATCH RFC for-next 4/8] io_uring: introduce llist helpers
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (2 preceding siblings ...)
2022-06-20 16:18 ` [PATCH RFC for-next 3/8] io_uring: lockless task list Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 5/8] io_uring: batch task_work Dylan Yudaken
` (4 subsequent siblings)
8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Introduce helpers to atomically exchange the contents of an llist.
These can later be moved into common code.
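As a standalone illustration (sketch only, illustrative names): the helpers
below are thin wrappers around xchg()/cmpxchg() on the list head; for example,
exchanging the head with NULL is exactly what llist_del_all() already does.

#include <linux/atomic.h>
#include <linux/llist.h>

/* Sketch: equivalent to llist_del_all(); entries come back newest-first. */
static struct llist_node *example_take_all(struct llist_head *head)
{
	return xchg(&head->first, NULL);
}

/* Sketch: clear the head only if it still points at @expected; the return
 * value tells the caller whether anything new was queued in the meantime. */
static struct llist_node *example_clear_if(struct llist_head *head,
					   struct llist_node *expected)
{
	return cmpxchg(&head->first, expected, NULL);
}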
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 985b46dfebb6..eb29e3f7da5c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1009,6 +1009,36 @@ static void handle_tw_list(struct llist_node *node,
} while (node);
}
+/**
+ * io_llist_xchg - swap all entries in a lock-less list
+ * @head: the head of lock-less list to delete all entries
+ * @node: new entry to install as the head of the list
+ *
+ * If list is empty, return NULL, otherwise, return the pointer to the first entry.
+ * The order of entries returned is from the newest to the oldest added one.
+ */
+static inline struct llist_node *io_llist_xchg(struct llist_head *head,
+ struct llist_node *node)
+{
+ return xchg(&head->first, node);
+}
+
+/**
+ * io_llist_cmpxchg - possibly swap all entries in a lock-less list
+ * @head: the head of lock-less list to delete all entries
+ * @old: expected old value of the first entry of the list
+ * @new: new entry as the head of the list
+ *
+ * Perform a cmpxchg on the first entry of the list.
+ */
+
+static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
+ struct llist_node *old,
+ struct llist_node *new)
+{
+ return cmpxchg(&head->first, old, new);
+}
+
void tctx_task_work(struct callback_head *cb)
{
bool uring_locked = false;
--
2.30.2
* [PATCH RFC for-next 5/8] io_uring: batch task_work
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (3 preceding siblings ...)
2022-06-20 16:18 ` [PATCH RFC for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
` (3 subsequent siblings)
8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Batching task work up is an important performance optimisation, as
task_work_add is expensive.
In order to keep the semantics, replace the task_list with a fake node
while processing the old list, and then do a cmpxchg at the end to see
if there is more work.
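Stripped of the io_uring specifics, the loop being described looks roughly
like this (a sketch; handle_one() and the names are illustrative, and
io_llist_xchg()/io_llist_cmpxchg() from the previous patch are thin wrappers
around the raw xchg()/cmpxchg() used here):

#include <linux/atomic.h>
#include <linux/llist.h>

static void example_run_all(struct llist_head *head,
			    void (*handle_one)(struct llist_node *))
{
	struct llist_node fake = {};
	struct llist_node *node = xchg(&head->first, &fake);

	for (;;) {
		/* run everything that was queued before the fake node */
		while (node && node != &fake) {
			struct llist_node *next = node->next;

			handle_one(node);
			node = next;
		}
		/* nothing was added behind the fake node: clear it and stop */
		if (cmpxchg(&head->first, &fake, NULL) == &fake)
			break;
		/* new work was pushed on top of the fake node: detach it,
		 * reinstall the fake node as the head, and go around again */
		node = xchg(&head->first, &fake);
	}
}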
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 24 ++++++++++++++++--------
1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index eb29e3f7da5c..0b4d3eb06d16 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -988,9 +988,10 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
static void handle_tw_list(struct llist_node *node,
- struct io_ring_ctx **ctx, bool *locked)
+ struct io_ring_ctx **ctx, bool *locked,
+ struct llist_node *fake)
{
- do {
+ while (node && node != fake) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
@@ -1006,7 +1007,10 @@ static void handle_tw_list(struct llist_node *node,
}
req->io_task_work.func(req, locked);
node = next;
- } while (node);
+ count++;
+ }
+
+ return count;
}
/**
@@ -1045,11 +1049,15 @@ void tctx_task_work(struct callback_head *cb)
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
- struct llist_node *node = llist_del_all(&tctx->task_list);
-
- if (node) {
- handle_tw_list(node, &ctx, &uring_locked);
- cond_resched();
+ struct llist_node fake = {};
+ struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+
+ handle_tw_list(node, &ctx, &uring_locked, &fake);
+ node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
+ while (node != &fake) {
+ node = io_llist_xchg(&tctx->task_list, &fake);
+ handle_tw_list(node, &ctx, &uring_locked, &fake);
+ node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
}
ctx_flush_and_put(ctx, &uring_locked);
--
2.30.2
* [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (4 preceding siblings ...)
2022-06-20 16:18 ` [PATCH RFC for-next 5/8] io_uring: batch task_work Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
2022-06-20 16:19 ` [PATCH RFC for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
` (2 subsequent siblings)
8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
The TP_printk macros are not supposed to use custom code ([1]), or else
tools such as perf cannot use these events.
Convert the opcode string representation to use the __string wiring that
the event framework provides ([2]).
[1]: https://lwn.net/Articles/379903/
[2]: https://lwn.net/Articles/381064/
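For readers unfamiliar with that wiring, the shape of the conversion is
roughly the following (a generic sketch, not one of the real events below):

/* Sketch of the __string()/__assign_str()/__get_str() pattern; the event
 * name and fields here are illustrative only. */
TRACE_EVENT(example_opcode_event,

	TP_PROTO(struct io_kiocb *req),

	TP_ARGS(req),

	TP_STRUCT__entry(
		__field(	u8,	opcode	)
		/* reserve space in the trace buffer for the string itself */
		__string(	op_str,	io_uring_get_opcode(req->opcode) )
	),

	TP_fast_assign(
		__entry->opcode = req->opcode;
		/* the string is copied at trace time, inside the kernel */
		__assign_str(op_str, io_uring_get_opcode(req->opcode));
	),

	/* TP_printk now only formats data already stored in the buffer,
	 * so perf and similar tools can decode the event */
	TP_printk("opcode %s", __get_str(op_str))
);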
Fixes: 033b87d2 ("io_uring: use the text representation of ops in trace")
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/trace/events/io_uring.h | 42 +++++++++++++++++++++++++++------
1 file changed, 35 insertions(+), 7 deletions(-)
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 5635912e1013..3bc8dec9acaa 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -151,6 +151,8 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( unsigned int, flags )
__field( struct io_wq_work *, work )
__field( int, rw )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -161,11 +163,13 @@ TRACE_EVENT(io_uring_queue_async_work,
__entry->opcode = req->opcode;
__entry->work = &req->work;
__entry->rw = rw;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
);
@@ -188,6 +192,8 @@ TRACE_EVENT(io_uring_defer,
__field( void *, req )
__field( unsigned long long, data )
__field( u8, opcode )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -195,11 +201,13 @@ TRACE_EVENT(io_uring_defer,
__entry->req = req;
__entry->data = req->cqe.user_data;
__entry->opcode = req->opcode;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
__entry->ctx, __entry->req, __entry->data,
- io_uring_get_opcode(__entry->opcode))
+ __get_str(op_str))
);
/**
@@ -284,6 +292,8 @@ TRACE_EVENT(io_uring_fail_link,
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( void *, link )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -292,11 +302,13 @@ TRACE_EVENT(io_uring_fail_link,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->link = link;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode), __entry->link)
+ __get_str(op_str), __entry->link)
);
/**
@@ -370,6 +382,8 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -380,11 +394,13 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
- __entry->user_data, io_uring_get_opcode(__entry->opcode),
+ __entry->user_data, __get_str(op_str),
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
);
@@ -411,6 +427,8 @@ TRACE_EVENT(io_uring_poll_arm,
__field( u8, opcode )
__field( int, mask )
__field( int, events )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -420,11 +438,13 @@ TRACE_EVENT(io_uring_poll_arm,
__entry->opcode = req->opcode;
__entry->mask = mask;
__entry->events = events;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->mask, __entry->events)
);
@@ -447,6 +467,8 @@ TRACE_EVENT(io_uring_task_add,
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( int, mask )
+
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
@@ -455,11 +477,13 @@ TRACE_EVENT(io_uring_task_add,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->mask = mask;
+
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->mask)
);
@@ -495,6 +519,8 @@ TRACE_EVENT(io_uring_req_failed,
__field( u64, pad1 )
__field( u64, addr3 )
__field( int, error )
+
+ __string( op_str, io_uring_get_opcode(sqe->opcode) )
),
TP_fast_assign(
@@ -514,6 +540,8 @@ TRACE_EVENT(io_uring_req_failed,
__entry->pad1 = sqe->__pad2[0];
__entry->addr3 = sqe->addr3;
__entry->error = error;
+
+ __assign_str(op_str, io_uring_get_opcode(sqe->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, "
@@ -522,7 +550,7 @@ TRACE_EVENT(io_uring_req_failed,
"personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, "
"error=%d",
__entry->ctx, __entry->req, __entry->user_data,
- io_uring_get_opcode(__entry->opcode),
+ __get_str(op_str),
__entry->flags, __entry->ioprio,
(unsigned long long)__entry->off,
(unsigned long long) __entry->addr, __entry->len,
--
2.30.2
* [PATCH RFC for-next 7/8] io_uring: add trace event for running task work
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (5 preceding siblings ...)
2022-06-20 16:18 ` [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
@ 2022-06-20 16:19 ` Dylan Yudaken
2022-06-20 16:19 ` [PATCH RFC for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
2022-06-21 5:10 ` [PATCH RFC for-next 0/8] io_uring: tw contention improvments Hao Xu
8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:19 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
This is useful for investigating whether task_work is batching.
Signed-off-by: Dylan Yudaken <[email protected]>
---
include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 3bc8dec9acaa..918e3a43e4b2 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow,
__entry->cflags, __entry->ocqe)
);
+/*
+ * io_uring_task_work_run - ran task work
+ *
+ * @tctx: pointer to a io_uring_task
+ * @count: how many functions it ran
+ * @loops: how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_task_work_run,
+
+ TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+
+ TP_ARGS(tctx, count, loops),
+
+ TP_STRUCT__entry (
+ __field( void *, tctx )
+ __field( unsigned int, count )
+ __field( unsigned int, loops )
+ ),
+
+ TP_fast_assign(
+ __entry->tctx = tctx;
+ __entry->count = count;
+ __entry->loops = loops;
+ ),
+
+ TP_printk("tctx %p, count %u, loops %u",
+ __entry->tctx, __entry->count, __entry->loops)
+);
+
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
--
2.30.2
* [PATCH RFC for-next 8/8] io_uring: trace task_work_run
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (6 preceding siblings ...)
2022-06-20 16:19 ` [PATCH RFC for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
@ 2022-06-20 16:19 ` Dylan Yudaken
2022-06-21 5:10 ` [PATCH RFC for-next 0/8] io_uring: tw contention improvments Hao Xu
8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:19 UTC (permalink / raw)
To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken
Trace task_work_run to help provide stats on how often task work is run
and what batch sizes are coming through.
Signed-off-by: Dylan Yudaken <[email protected]>
---
io_uring/io_uring.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0b4d3eb06d16..8cf868315008 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -987,10 +987,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
}
-static void handle_tw_list(struct llist_node *node,
- struct io_ring_ctx **ctx, bool *locked,
- struct llist_node *fake)
+static unsigned int handle_tw_list(struct llist_node *node,
+ struct io_ring_ctx **ctx, bool *locked,
+ struct llist_node *fake)
{
+ unsigned int count = 0;
+
while (node && node != fake) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
@@ -1051,12 +1053,14 @@ void tctx_task_work(struct callback_head *cb)
task_work);
struct llist_node fake = {};
struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+ unsigned int loops = 1;
+ unsigned int count = handle_tw_list(node, &ctx, &uring_locked, &fake);
- handle_tw_list(node, &ctx, &uring_locked, &fake);
node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
while (node != &fake) {
+ loops++;
node = io_llist_xchg(&tctx->task_list, &fake);
- handle_tw_list(node, &ctx, &uring_locked, &fake);
+ count += handle_tw_list(node, &ctx, &uring_locked, &fake);
node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
}
@@ -1065,6 +1069,8 @@ void tctx_task_work(struct callback_head *cb)
/* relaxed read is enough as only the task itself sets ->in_idle */
if (unlikely(atomic_read(&tctx->in_idle)))
io_uring_drop_tctx_refs(current);
+
+ trace_io_uring_task_work_run(tctx, count, loops);
}
void io_req_task_work_add(struct io_kiocb *req)
--
2.30.2
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
` (7 preceding siblings ...)
2022-06-20 16:19 ` [PATCH RFC for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
@ 2022-06-21 5:10 ` Hao Xu
2022-06-21 7:03 ` Dylan Yudaken
8 siblings, 1 reply; 20+ messages in thread
From: Hao Xu @ 2022-06-21 5:10 UTC (permalink / raw)
To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel-team
On 6/21/22 00:18, Dylan Yudaken wrote:
> Task work currently uses a spin lock to guard task_list and
> task_running. Some use cases such as networking can trigger task_work_add
> from multiple threads all at once, which suffers from contention here.
>
> This can be changed to use a lockless list which seems to have better
> performance. Running the micro benchmark in [1] I see 20% improvment in
> multithreaded task work add. It required removing the priority tw list
> optimisation, however it isn't clear how important that optimisation is.
> Additionally it has fairly easy to break semantics.
>
> Patch 1-2 remove the priority tw list optimisation
> Patch 3-5 add lockless lists for task work
> Patch 6 fixes a bug I noticed in io_uring event tracing
> Patch 7-8 adds tracing for task_work_run
>
Compared to the spinlock overhead, the prio task list optimization is
definitely unimportant, so I agree with removing it here.
Replacing the task list with an llist was something I considered, but I
gave it up since it changes the list to a stack, which means we have to
handle the tasks in reverse order. This may affect latency; do you have
some numbers for it, like avg and 99%/95% lat?
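(For reference, a sketch of the ordering point being raised: draining an
llist yields entries newest-first, and llist_reverse_order() is the stock
helper that would restore submission order if that turns out to matter.
Names are illustrative and none of this is in the posted series.)

#include <linux/llist.h>

static void example_fifo_drain(struct llist_head *head,
			       void (*handle_one)(struct llist_node *))
{
	/* llist_del_all() returns LIFO order; reverse to get FIFO back */
	struct llist_node *node = llist_reverse_order(llist_del_all(head));

	while (node) {
		struct llist_node *next = node->next;

		handle_one(node);
		node = next;
	}
}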
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-21 5:10 ` [PATCH RFC for-next 0/8] io_uring: tw contention improvments Hao Xu
@ 2022-06-21 7:03 ` Dylan Yudaken
2022-06-21 7:34 ` Hao Xu
2022-06-21 7:38 ` Hao Xu
0 siblings, 2 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-21 7:03 UTC (permalink / raw)
To: [email protected], [email protected], [email protected],
[email protected]
Cc: Kernel Team
On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
> On 6/21/22 00:18, Dylan Yudaken wrote:
> > Task work currently uses a spin lock to guard task_list and
> > task_running. Some use cases such as networking can trigger
> > task_work_add
> > from multiple threads all at once, which suffers from contention
> > here.
> >
> > This can be changed to use a lockless list which seems to have
> > better
> > performance. Running the micro benchmark in [1] I see 20%
> > improvment in
> > multithreaded task work add. It required removing the priority tw
> > list
> > optimisation, however it isn't clear how important that
> > optimisation is.
> > Additionally it has fairly easy to break semantics.
> >
> > Patch 1-2 remove the priority tw list optimisation
> > Patch 3-5 add lockless lists for task work
> > Patch 6 fixes a bug I noticed in io_uring event tracing
> > Patch 7-8 adds tracing for task_work_run
> >
>
> Compared to the spinlock overhead, the prio task list optimization is
> definitely unimportant, so I agree with removing it here.
> Replace the task list with llisy was something I considered but I
> gave
> it up since it changes the list to a stack which means we have to
> handle
> the tasks in a reverse order. This may affect the latency, do you
> have
> some numbers for it, like avg and 99% 95% lat?
>
Do you have an idea for how to test that? I used a microbenchmark as
well as a network benchmark [1] to verify that overall throughput is
higher. TW latency sounds a lot more complicated to measure as it's
difficult to trigger accurately.
My feeling is that with reasonable batching (say 8-16 items) the
latency will be low as TW is generally very quick, but if you have an
idea for benchmarking I can take a look
[1]: https://github.com/DylanZA/netbench
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-21 7:03 ` Dylan Yudaken
@ 2022-06-21 7:34 ` Hao Xu
2022-06-22 9:31 ` Dylan Yudaken
2022-06-21 7:38 ` Hao Xu
1 sibling, 1 reply; 20+ messages in thread
From: Hao Xu @ 2022-06-21 7:34 UTC (permalink / raw)
To: Dylan Yudaken, [email protected], [email protected],
[email protected]
Cc: Kernel Team
On 6/21/22 15:03, Dylan Yudaken wrote:
> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>> Task work currently uses a spin lock to guard task_list and
>>> task_running. Some use cases such as networking can trigger
>>> task_work_add
>>> from multiple threads all at once, which suffers from contention
>>> here.
>>>
>>> This can be changed to use a lockless list which seems to have
>>> better
>>> performance. Running the micro benchmark in [1] I see 20%
>>> improvment in
>>> multithreaded task work add. It required removing the priority tw
>>> list
>>> optimisation, however it isn't clear how important that
>>> optimisation is.
>>> Additionally it has fairly easy to break semantics.
>>>
>>> Patch 1-2 remove the priority tw list optimisation
>>> Patch 3-5 add lockless lists for task work
>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>> Patch 7-8 adds tracing for task_work_run
>>>
>>
>> Compared to the spinlock overhead, the prio task list optimization is
>> definitely unimportant, so I agree with removing it here.
>> Replace the task list with llisy was something I considered but I
>> gave
>> it up since it changes the list to a stack which means we have to
>> handle
>> the tasks in a reverse order. This may affect the latency, do you
>> have
>> some numbers for it, like avg and 99% 95% lat?
>>
>
> Do you have an idea for how to test that? I used a microbenchmark as
> well as a network benchmark [1] to verify that overall throughput is
> higher. TW latency sounds a lot more complicated to measure as it's
> difficult to trigger accurately.
>
> My feeling is that with reasonable batching (say 8-16 items) the
> latency will be low as TW is generally very quick, but if you have an
> idea for benchmarking I can take a look
>
> [1]: https://github.com/DylanZA/netbench
It can be normal IO requests, I think. We can test the latency with fio
doing small-size IO to a fast block device (like nvme) in SQPOLL
mode (since for non-SQPOLL it doesn't make a difference). This way we can
see the influence of the reverse order handling.
Regards,
Hao
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-21 7:34 ` Hao Xu
@ 2022-06-22 9:31 ` Dylan Yudaken
2022-06-22 11:16 ` Hao Xu
0 siblings, 1 reply; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-22 9:31 UTC (permalink / raw)
To: [email protected], [email protected], [email protected],
[email protected]
Cc: Kernel Team
On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
> On 6/21/22 15:03, Dylan Yudaken wrote:
> > On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
> > > On 6/21/22 00:18, Dylan Yudaken wrote:
> > > > Task work currently uses a spin lock to guard task_list and
> > > > task_running. Some use cases such as networking can trigger
> > > > task_work_add
> > > > from multiple threads all at once, which suffers from
> > > > contention
> > > > here.
> > > >
> > > > This can be changed to use a lockless list which seems to have
> > > > better
> > > > performance. Running the micro benchmark in [1] I see 20%
> > > > improvment in
> > > > multithreaded task work add. It required removing the priority
> > > > tw
> > > > list
> > > > optimisation, however it isn't clear how important that
> > > > optimisation is.
> > > > Additionally it has fairly easy to break semantics.
> > > >
> > > > Patch 1-2 remove the priority tw list optimisation
> > > > Patch 3-5 add lockless lists for task work
> > > > Patch 6 fixes a bug I noticed in io_uring event tracing
> > > > Patch 7-8 adds tracing for task_work_run
> > > >
> > >
> > > Compared to the spinlock overhead, the prio task list
> > > optimization is
> > > definitely unimportant, so I agree with removing it here.
> > > Replace the task list with llisy was something I considered but I
> > > gave
> > > it up since it changes the list to a stack which means we have to
> > > handle
> > > the tasks in a reverse order. This may affect the latency, do you
> > > have
> > > some numbers for it, like avg and 99% 95% lat?
> > >
> >
> > Do you have an idea for how to test that? I used a microbenchmark
> > as
> > well as a network benchmark [1] to verify that overall throughput
> > is
> > higher. TW latency sounds a lot more complicated to measure as it's
> > difficult to trigger accurately.
> >
> > My feeling is that with reasonable batching (say 8-16 items) the
> > latency will be low as TW is generally very quick, but if you have
> > an
> > idea for benchmarking I can take a look
> >
> > [1]: https://github.com/DylanZA/netbench
>
> It can be normal IO requests I think. We can test the latency by fio
> with small size IO to a fast block device(like nvme) in SQPOLL
> mode(since for non-SQPOLL, it doesn't make difference). This way we
> can
> see the influence of reverse order handling.
>
> Regards,
> Hao
I see little difference locally, but there is quite a big stdev so it's
possible my test setup is a bit wonky
new:
clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
lat (nsec): min=1440, max=16719k, avg=119714.72, stdev=153571.49
old:
clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
lat (nsec): min=1278, max=16610k, avg=121025.73, stdev=211896.14
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-22 9:31 ` Dylan Yudaken
@ 2022-06-22 11:16 ` Hao Xu
2022-06-22 11:24 ` Hao Xu
0 siblings, 1 reply; 20+ messages in thread
From: Hao Xu @ 2022-06-22 11:16 UTC (permalink / raw)
To: Dylan Yudaken, [email protected], [email protected],
[email protected]
Cc: Kernel Team
On 6/22/22 17:31, Dylan Yudaken wrote:
> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>> Task work currently uses a spin lock to guard task_list and
>>>>> task_running. Some use cases such as networking can trigger
>>>>> task_work_add
>>>>> from multiple threads all at once, which suffers from
>>>>> contention
>>>>> here.
>>>>>
>>>>> This can be changed to use a lockless list which seems to have
>>>>> better
>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>> improvment in
>>>>> multithreaded task work add. It required removing the priority
>>>>> tw
>>>>> list
>>>>> optimisation, however it isn't clear how important that
>>>>> optimisation is.
>>>>> Additionally it has fairly easy to break semantics.
>>>>>
>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>> Patch 3-5 add lockless lists for task work
>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>
>>>>
>>>> Compared to the spinlock overhead, the prio task list
>>>> optimization is
>>>> definitely unimportant, so I agree with removing it here.
>>>> Replace the task list with llisy was something I considered but I
>>>> gave
>>>> it up since it changes the list to a stack which means we have to
>>>> handle
>>>> the tasks in a reverse order. This may affect the latency, do you
>>>> have
>>>> some numbers for it, like avg and 99% 95% lat?
>>>>
>>>
>>> Do you have an idea for how to test that? I used a microbenchmark
>>> as
>>> well as a network benchmark [1] to verify that overall throughput
>>> is
>>> higher. TW latency sounds a lot more complicated to measure as it's
>>> difficult to trigger accurately.
>>>
>>> My feeling is that with reasonable batching (say 8-16 items) the
>>> latency will be low as TW is generally very quick, but if you have
>>> an
>>> idea for benchmarking I can take a look
>>>
>>> [1]: https://github.com/DylanZA/netbench
>>
>> It can be normal IO requests I think. We can test the latency by fio
>> with small size IO to a fast block device(like nvme) in SQPOLL
>> mode(since for non-SQPOLL, it doesn't make difference). This way we
>> can
>> see the influence of reverse order handling.
>>
>> Regards,
>> Hao
>
> I see little difference locally, but there is quite a big stdev so it's
> possible my test setup is a bit wonky
>
> new:
> clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
> lat (nsec): min=1440, max=16719k, avg=119714.72, stdev=153571.49
> old:
> clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
> lat (nsec): min=1278, max=16610k, avg=121025.73, stdev=211896.14
>
Hi Dylan,
Could you post the arguments you use and the 99% 95% latency as well?
Regards,
Hao
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-22 11:16 ` Hao Xu
@ 2022-06-22 11:24 ` Hao Xu
2022-06-22 11:51 ` Dylan Yudaken
2022-06-22 11:52 ` Hao Xu
0 siblings, 2 replies; 20+ messages in thread
From: Hao Xu @ 2022-06-22 11:24 UTC (permalink / raw)
To: Dylan Yudaken, [email protected], [email protected],
[email protected]
Cc: Kernel Team
On 6/22/22 19:16, Hao Xu wrote:
> On 6/22/22 17:31, Dylan Yudaken wrote:
>> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>>> Task work currently uses a spin lock to guard task_list and
>>>>>> task_running. Some use cases such as networking can trigger
>>>>>> task_work_add
>>>>>> from multiple threads all at once, which suffers from
>>>>>> contention
>>>>>> here.
>>>>>>
>>>>>> This can be changed to use a lockless list which seems to have
>>>>>> better
>>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>>> improvment in
>>>>>> multithreaded task work add. It required removing the priority
>>>>>> tw
>>>>>> list
>>>>>> optimisation, however it isn't clear how important that
>>>>>> optimisation is.
>>>>>> Additionally it has fairly easy to break semantics.
>>>>>>
>>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>>> Patch 3-5 add lockless lists for task work
>>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>>
>>>>>
>>>>> Compared to the spinlock overhead, the prio task list
>>>>> optimization is
>>>>> definitely unimportant, so I agree with removing it here.
>>>>> Replace the task list with llisy was something I considered but I
>>>>> gave
>>>>> it up since it changes the list to a stack which means we have to
>>>>> handle
>>>>> the tasks in a reverse order. This may affect the latency, do you
>>>>> have
>>>>> some numbers for it, like avg and 99% 95% lat?
>>>>>
>>>>
>>>> Do you have an idea for how to test that? I used a microbenchmark
>>>> as
>>>> well as a network benchmark [1] to verify that overall throughput
>>>> is
>>>> higher. TW latency sounds a lot more complicated to measure as it's
>>>> difficult to trigger accurately.
>>>>
>>>> My feeling is that with reasonable batching (say 8-16 items) the
>>>> latency will be low as TW is generally very quick, but if you have
>>>> an
>>>> idea for benchmarking I can take a look
>>>>
>>>> [1]: https://github.com/DylanZA/netbench
>>>
>>> It can be normal IO requests I think. We can test the latency by fio
>>> with small size IO to a fast block device(like nvme) in SQPOLL
>>> mode(since for non-SQPOLL, it doesn't make difference). This way we
>>> can
>>> see the influence of reverse order handling.
>>>
>>> Regards,
>>> Hao
>>
>> I see little difference locally, but there is quite a big stdev so it's
>> possible my test setup is a bit wonky
>>
>> new:
>> clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>> lat (nsec): min=1440, max=16719k, avg=119714.72, stdev=153571.49
>> old:
>> clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>> lat (nsec): min=1278, max=16610k, avg=121025.73, stdev=211896.14
>>
>
> Hi Dylan,
>
> Could you post the arguments you use and the 99% 95% latency as well?
>
> Regards,
> Hao
>
One thing I'm worried about is that under heavy workloads there is a
continuous stream of TWs coming in, so the TWs at the end of the TW list
don't get the chance to run, which leads to the latency of those ones
becoming high.
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-22 11:24 ` Hao Xu
@ 2022-06-22 11:51 ` Dylan Yudaken
2022-06-22 12:28 ` Hao Xu
2022-06-22 11:52 ` Hao Xu
1 sibling, 1 reply; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-22 11:51 UTC (permalink / raw)
To: [email protected], [email protected], [email protected],
[email protected]
Cc: Kernel Team
On Wed, 2022-06-22 at 19:24 +0800, Hao Xu wrote:
> On 6/22/22 19:16, Hao Xu wrote:
> > On 6/22/22 17:31, Dylan Yudaken wrote:
> > > On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
> > > > On 6/21/22 15:03, Dylan Yudaken wrote:
> > > > > On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
> > > > > > On 6/21/22 00:18, Dylan Yudaken wrote:
> > > > > > > Task work currently uses a spin lock to guard task_list
> > > > > > > and
> > > > > > > task_running. Some use cases such as networking can
> > > > > > > trigger
> > > > > > > task_work_add
> > > > > > > from multiple threads all at once, which suffers from
> > > > > > > contention
> > > > > > > here.
> > > > > > >
> > > > > > > This can be changed to use a lockless list which seems to
> > > > > > > have
> > > > > > > better
> > > > > > > performance. Running the micro benchmark in [1] I see 20%
> > > > > > > improvment in
> > > > > > > multithreaded task work add. It required removing the
> > > > > > > priority
> > > > > > > tw
> > > > > > > list
> > > > > > > optimisation, however it isn't clear how important that
> > > > > > > optimisation is.
> > > > > > > Additionally it has fairly easy to break semantics.
> > > > > > >
> > > > > > > Patch 1-2 remove the priority tw list optimisation
> > > > > > > Patch 3-5 add lockless lists for task work
> > > > > > > Patch 6 fixes a bug I noticed in io_uring event tracing
> > > > > > > Patch 7-8 adds tracing for task_work_run
> > > > > > >
> > > > > >
> > > > > > Compared to the spinlock overhead, the prio task list
> > > > > > optimization is
> > > > > > definitely unimportant, so I agree with removing it here.
> > > > > > Replace the task list with llisy was something I considered
> > > > > > but I
> > > > > > gave
> > > > > > it up since it changes the list to a stack which means we
> > > > > > have to
> > > > > > handle
> > > > > > the tasks in a reverse order. This may affect the latency,
> > > > > > do you
> > > > > > have
> > > > > > some numbers for it, like avg and 99% 95% lat?
> > > > > >
> > > > >
> > > > > Do you have an idea for how to test that? I used a
> > > > > microbenchmark
> > > > > as
> > > > > well as a network benchmark [1] to verify that overall
> > > > > throughput
> > > > > is
> > > > > higher. TW latency sounds a lot more complicated to measure
> > > > > as it's
> > > > > difficult to trigger accurately.
> > > > >
> > > > > My feeling is that with reasonable batching (say 8-16 items)
> > > > > the
> > > > > latency will be low as TW is generally very quick, but if you
> > > > > have
> > > > > an
> > > > > idea for benchmarking I can take a look
> > > > >
> > > > > [1]: https://github.com/DylanZA/netbench
> > > >
> > > > It can be normal IO requests I think. We can test the latency
> > > > by fio
> > > > with small size IO to a fast block device(like nvme) in SQPOLL
> > > > mode(since for non-SQPOLL, it doesn't make difference). This
> > > > way we
> > > > can
> > > > see the influence of reverse order handling.
> > > >
> > > > Regards,
> > > > Hao
> > >
> > > I see little difference locally, but there is quite a big stdev
> > > so it's
> > > possible my test setup is a bit wonky
> > >
> > > new:
> > > clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
> > > lat (nsec): min=1440, max=16719k, avg=119714.72,
> > > stdev=153571.49
> > > old:
> > > clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
> > > lat (nsec): min=1278, max=16610k, avg=121025.73,
> > > stdev=211896.14
> > >
> >
> > Hi Dylan,
> >
> > Could you post the arguments you use and the 99% 95% latency as
> > well?
> >
> > Regards,
> > Hao
> >
>
> One thing I'm worrying about is under heavy workloads, there are
> contiguous TWs coming in, thus the TWs at the end of the TW list
> doesn't
> get the chance to run, which leads to the latency of those ones
> becoming
> high.
Pavel mentioned I should change some arguments, so I reran it. I'll
just post all the output below as I'm not sure exactly what you are looking
for. Note I checked that it is definitely batching, doing
batches of 10-20 in tctx_task_work.
*** config ***
[global]
ioengine=io_uring
sqthread_poll=1
registerfiles=1
fixedbufs=1
hipri=0
thread=1
direct=0
rw=randread
time_based=1
runtime=600
ramp_time=30
randrepeat=0
group_reporting=0
sqthread_poll_cpu=15
iodepth=32
[job0]
filename=/dev/nullb0
cpus_allowed=1
bs=512
*** new ***
job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
512B, ioengine=io_uring, iodepth=32
fio-3.30-59-gd4bf5
Starting 1 thread
Jobs: 1 (f=0): [f(1)][100.0%][r=360MiB/s][r=738k IOPS][eta 00m:00s]
job0: (groupid=0, jobs=1): err= 0: pid=37255: Wed Jun 22 03:44:23 2022
read: IOPS=596k, BW=291MiB/s (305MB/s)(171GiB/600001msec)
clat (msec): min=30343, max=630343, avg=369885.75, stdev=164921.26
lat (usec): min=14, max=1802, avg=53.23, stdev=18.84
clat percentiles (msec):
| 1.00th=[17113], 5.00th=[17113], 10.00th=[17113],
20.00th=[17113],
| 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
60.00th=[17113],
| 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
95.00th=[17113],
| 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
99.95th=[17113],
| 99.99th=[17113]
bw ( KiB/s): min=169237, max=381603, per=100.00%, avg=298171.22,
stdev=70580.65, samples=1199
iops : min=338474, max=763206, avg=596342.60,
stdev=141161.31, samples=1199
lat (msec) : >=2000=100.00%
cpu : usr=99.98%, sys=0.00%, ctx=4378, majf=0, minf=9
IO depths : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%,
>=64=0.0%
submit : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
>=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%,
>=64=0.0%
issued rwts: total=357661967,0,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=32
Run status group 0 (all jobs):
READ: bw=291MiB/s (305MB/s), 291MiB/s-291MiB/s (305MB/s-305MB/s),
io=171GiB (183GB), run=600001-600001msec
Disk stats (read/write):
nullb0: ios=72127555/0, merge=11/0, ticks=1396298/0,
in_queue=1396298, util=100.00%
*** old ***
job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
512B, ioengine=io_uring, iodepth=32
fio-3.30-59-gd4bf5
Starting 1 thread
Jobs: 1 (f=1): [r(1)][100.0%][r=367MiB/s][r=751k IOPS][eta 00m:00s]
job0: (groupid=0, jobs=1): err= 0: pid=19216: Wed Jun 22 04:43:36 2022
read: IOPS=609k, BW=297MiB/s (312MB/s)(174GiB/600001msec)
clat (msec): min=30333, max=630333, avg=368961.53, stdev=164532.01
lat (usec): min=14, max=5830, avg=52.11, stdev=18.64
clat percentiles (msec):
| 1.00th=[17113], 5.00th=[17113], 10.00th=[17113],
20.00th=[17113],
| 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
60.00th=[17113],
| 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
95.00th=[17113],
| 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
99.95th=[17113],
| 99.99th=[17113]
bw ( KiB/s): min=170273, max=386932, per=100.00%, avg=304548.39,
stdev=70732.20, samples=1200
iops : min=340547, max=773864, avg=609096.94,
stdev=141464.41, samples=1200
lat (msec) : >=2000=100.00%
cpu : usr=99.98%, sys=0.00%, ctx=3912, majf=0, minf=5
IO depths : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%,
>=64=0.0%
submit : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
>=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%,
>=64=0.0%
issued rwts: total=365258392,0,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=32
Run status group 0 (all jobs):
READ: bw=297MiB/s (312MB/s), 297MiB/s-297MiB/s (312MB/s-312MB/s),
io=174GiB (187GB), run=600001-600001msec
Disk stats (read/write):
nullb0: ios=69031421/0, merge=1/0, ticks=1323086/0, in_queue=1323086,
util=100.00%
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-22 11:51 ` Dylan Yudaken
@ 2022-06-22 12:28 ` Hao Xu
2022-06-22 12:29 ` Hao Xu
0 siblings, 1 reply; 20+ messages in thread
From: Hao Xu @ 2022-06-22 12:28 UTC (permalink / raw)
To: Dylan Yudaken, [email protected], [email protected],
[email protected]
Cc: Kernel Team
On 6/22/22 19:51, Dylan Yudaken wrote:
> On Wed, 2022-06-22 at 19:24 +0800, Hao Xu wrote:
>> On 6/22/22 19:16, Hao Xu wrote:
>>> On 6/22/22 17:31, Dylan Yudaken wrote:
>>>> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>>>>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>>>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>>>>> Task work currently uses a spin lock to guard task_list
>>>>>>>> and
>>>>>>>> task_running. Some use cases such as networking can
>>>>>>>> trigger
>>>>>>>> task_work_add
>>>>>>>> from multiple threads all at once, which suffers from
>>>>>>>> contention
>>>>>>>> here.
>>>>>>>>
>>>>>>>> This can be changed to use a lockless list which seems to
>>>>>>>> have
>>>>>>>> better
>>>>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>>>>> improvment in
>>>>>>>> multithreaded task work add. It required removing the
>>>>>>>> priority
>>>>>>>> tw
>>>>>>>> list
>>>>>>>> optimisation, however it isn't clear how important that
>>>>>>>> optimisation is.
>>>>>>>> Additionally it has fairly easy to break semantics.
>>>>>>>>
>>>>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>>>>> Patch 3-5 add lockless lists for task work
>>>>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>>>>
>>>>>>>
>>>>>>> Compared to the spinlock overhead, the prio task list
>>>>>>> optimization is
>>>>>>> definitely unimportant, so I agree with removing it here.
>>>>>>> Replace the task list with llisy was something I considered
>>>>>>> but I
>>>>>>> gave
>>>>>>> it up since it changes the list to a stack which means we
>>>>>>> have to
>>>>>>> handle
>>>>>>> the tasks in a reverse order. This may affect the latency,
>>>>>>> do you
>>>>>>> have
>>>>>>> some numbers for it, like avg and 99% 95% lat?
>>>>>>>
>>>>>>
>>>>>> Do you have an idea for how to test that? I used a
>>>>>> microbenchmark
>>>>>> as
>>>>>> well as a network benchmark [1] to verify that overall
>>>>>> throughput
>>>>>> is
>>>>>> higher. TW latency sounds a lot more complicated to measure
>>>>>> as it's
>>>>>> difficult to trigger accurately.
>>>>>>
>>>>>> My feeling is that with reasonable batching (say 8-16 items)
>>>>>> the
>>>>>> latency will be low as TW is generally very quick, but if you
>>>>>> have
>>>>>> an
>>>>>> idea for benchmarking I can take a look
>>>>>>
>>>>>> [1]: https://github.com/DylanZA/netbench
>>>>>
>>>>> It can be normal IO requests I think. We can test the latency
>>>>> by fio
>>>>> with small size IO to a fast block device(like nvme) in SQPOLL
>>>>> mode(since for non-SQPOLL, it doesn't make difference). This
>>>>> way we
>>>>> can
>>>>> see the influence of reverse order handling.
>>>>>
>>>>> Regards,
>>>>> Hao
>>>>
>>>> I see little difference locally, but there is quite a big stdev
>>>> so it's
>>>> possible my test setup is a bit wonky
>>>>
>>>> new:
>>>> clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>>>> lat (nsec): min=1440, max=16719k, avg=119714.72,
>>>> stdev=153571.49
>>>> old:
>>>> clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>>>> lat (nsec): min=1278, max=16610k, avg=121025.73,
>>>> stdev=211896.14
>>>>
>>>
>>> Hi Dylan,
>>>
>>> Could you post the arguments you use and the 99% 95% latency as
>>> well?
>>>
>>> Regards,
>>> Hao
>>>
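[If it helps for the next run: fio can report the requested tail
percentiles directly; these are standard job-file options, subject to the
fio version in use.]

  clat_percentiles=1
  lat_percentiles=1
  percentile_list=50:95:99:99.9

That would add the 95%/99% completion and total latency lines to the
normal per-job output.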
>>
>> One thing I'm worrying about is under heavy workloads, there are
>> contiguous TWs coming in, thus the TWs at the end of the TW list
>> doesn't
>> get the chance to run, which leads to the latency of those ones
>> becoming
>> high.
>
> Pavel mentioned I should change some arguments, so I reran it. I'll
> just post all the output below as not sure exactly what you are looking
> for. Note I checked that it was definitely batching and it is doing
> batches of 10-20 in tctx_task_work
>
>
> *** config ***
>
> [global]
> ioengine=io_uring
> sqthread_poll=1
> registerfiles=1
> fixedbufs=1
> hipri=0
> thread=1
> direct=0
> rw=randread
> time_based=1
> runtime=600
> ramp_time=30
> randrepeat=0
> group_reporting=0
> sqthread_poll_cpu=15
> iodepth=32
>
> [job0]
> filename=/dev/nullb0
> cpus_allowed=1
> bs=512
>
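[Reproduction note: /dev/nullb0 is the memory-backed null_blk device, so
the block layer is exercised without real media latency; roughly the
following setup is assumed before running the job, with the job-file name
being arbitrary.]

  modprobe null_blk nr_devices=1
  fio tw-contention.fio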
> *** new ***
> job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
> 512B, ioengine=io_uring, iodepth=32
> fio-3.30-59-gd4bf5
> Starting 1 thread
> Jobs: 1 (f=0): [f(1)][100.0%][r=360MiB/s][r=738k IOPS][eta 00m:00s]
> job0: (groupid=0, jobs=1): err= 0: pid=37255: Wed Jun 22 03:44:23 2022
> read: IOPS=596k, BW=291MiB/s (305MB/s)(171GiB/600001msec)
> clat (msec): min=30343, max=630343, avg=369885.75, stdev=164921.26
> lat (usec): min=14, max=1802, avg=53.23, stdev=18.84
> clat percentiles (msec):
> | 1.00th=[17113], 5.00th=[17113], 10.00th=[17113],
> 20.00th=[17113],
> | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
> 60.00th=[17113],
> | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
> 95.00th=[17113],
> | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
> 99.95th=[17113],
> | 99.99th=[17113]
> bw ( KiB/s): min=169237, max=381603, per=100.00%, avg=298171.22,
> stdev=70580.65, samples=1199
> iops : min=338474, max=763206, avg=596342.60,
> stdev=141161.31, samples=1199
> lat (msec) : >=2000=100.00%
> cpu : usr=99.98%, sys=0.00%, ctx=4378, majf=0, minf=9
> IO depths : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%, >=64=0.0%
> submit : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
> complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
> issued rwts: total=357661967,0,0,0 short=0,0,0,0 dropped=0,0,0,0
> latency : target=0, window=0, percentile=100.00%, depth=32
>
> Run status group 0 (all jobs):
> READ: bw=291MiB/s (305MB/s), 291MiB/s-291MiB/s (305MB/s-305MB/s),
> io=171GiB (183GB), run=600001-600001msec
>
> Disk stats (read/write):
> nullb0: ios=72127555/0, merge=11/0, ticks=1396298/0,
> in_queue=1396298, util=100.00%
>
> *** old ***
>
> job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
> 512B, ioengine=io_uring, iodepth=32
> fio-3.30-59-gd4bf5
> Starting 1 thread
> Jobs: 1 (f=1): [r(1)][100.0%][r=367MiB/s][r=751k IOPS][eta 00m:00s]
> job0: (groupid=0, jobs=1): err= 0: pid=19216: Wed Jun 22 04:43:36 2022
> read: IOPS=609k, BW=297MiB/s (312MB/s)(174GiB/600001msec)
> clat (msec): min=30333, max=630333, avg=368961.53, stdev=164532.01
> lat (usec): min=14, max=5830, avg=52.11, stdev=18.64
> clat percentiles (msec):
> | 1.00th=[17113], 5.00th=[17113], 10.00th=[17113],
> 20.00th=[17113],
> | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
> 60.00th=[17113],
> | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
> 95.00th=[17113],
> | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
> 99.95th=[17113],
> | 99.99th=[17113]
> bw ( KiB/s): min=170273, max=386932, per=100.00%, avg=304548.39,
> stdev=70732.20, samples=1200
> iops : min=340547, max=773864, avg=609096.94,
> stdev=141464.41, samples=1200
> lat (msec) : >=2000=100.00%
> cpu : usr=99.98%, sys=0.00%, ctx=3912, majf=0, minf=5
> IO depths : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%, >=64=0.0%
> submit : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
> complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
> issued rwts: total=365258392,0,0,0 short=0,0,0,0 dropped=0,0,0,0
> latency : target=0, window=0, percentile=100.00%, depth=32
>
> Run status group 0 (all jobs):
> READ: bw=297MiB/s (312MB/s), 297MiB/s-297MiB/s (312MB/s-312MB/s),
> io=174GiB (187GB), run=600001-600001msec
>
> Disk stats (read/write):
> nullb0: ios=69031421/0, merge=1/0, ticks=1323086/0, in_queue=1323086,
> util=100.00%
>
Ok, the clat percentiles seems meanless here... from the min max and avg
data it should be fine. Thanks for testing.
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-22 12:28 ` Hao Xu
@ 2022-06-22 12:29 ` Hao Xu
0 siblings, 0 replies; 20+ messages in thread
From: Hao Xu @ 2022-06-22 12:29 UTC (permalink / raw)
To: Dylan Yudaken, [email protected], [email protected],
[email protected]
Cc: Kernel Team
On 6/22/22 20:28, Hao Xu wrote:
> On 6/22/22 19:51, Dylan Yudaken wrote:
>> On Wed, 2022-06-22 at 19:24 +0800, Hao Xu wrote:
>>> On 6/22/22 19:16, Hao Xu wrote:
>>>> On 6/22/22 17:31, Dylan Yudaken wrote:
>>>>> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>>>>>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>>>>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>>>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>>>>>> Task work currently uses a spin lock to guard task_list
>>>>>>>>> and
>>>>>>>>> task_running. Some use cases such as networking can
>>>>>>>>> trigger
>>>>>>>>> task_work_add
>>>>>>>>> from multiple threads all at once, which suffers from
>>>>>>>>> contention
>>>>>>>>> here.
>>>>>>>>>
>>>>>>>>> This can be changed to use a lockless list which seems to
>>>>>>>>> have
>>>>>>>>> better
>>>>>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>>>>>> improvment in
>>>>>>>>> multithreaded task work add. It required removing the
>>>>>>>>> priority
>>>>>>>>> tw
>>>>>>>>> list
>>>>>>>>> optimisation, however it isn't clear how important that
>>>>>>>>> optimisation is.
>>>>>>>>> Additionally it has fairly easy to break semantics.
>>>>>>>>>
>>>>>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>>>>>> Patch 3-5 add lockless lists for task work
>>>>>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>>>>>
>>>>>>>>
>>>>>>>> Compared to the spinlock overhead, the prio task list
>>>>>>>> optimization is
>>>>>>>> definitely unimportant, so I agree with removing it here.
>>>>>>>> Replace the task list with llisy was something I considered
>>>>>>>> but I
>>>>>>>> gave
>>>>>>>> it up since it changes the list to a stack which means we
>>>>>>>> have to
>>>>>>>> handle
>>>>>>>> the tasks in a reverse order. This may affect the latency,
>>>>>>>> do you
>>>>>>>> have
>>>>>>>> some numbers for it, like avg and 99% 95% lat?
>>>>>>>>
>>>>>>>
>>>>>>> Do you have an idea for how to test that? I used a
>>>>>>> microbenchmark
>>>>>>> as
>>>>>>> well as a network benchmark [1] to verify that overall
>>>>>>> throughput
>>>>>>> is
>>>>>>> higher. TW latency sounds a lot more complicated to measure
>>>>>>> as it's
>>>>>>> difficult to trigger accurately.
>>>>>>>
>>>>>>> My feeling is that with reasonable batching (say 8-16 items)
>>>>>>> the
>>>>>>> latency will be low as TW is generally very quick, but if you
>>>>>>> have
>>>>>>> an
>>>>>>> idea for benchmarking I can take a look
>>>>>>>
>>>>>>> [1]: https://github.com/DylanZA/netbench
>>>>>>
>>>>>> It can be normal IO requests I think. We can test the latency
>>>>>> by fio
>>>>>> with small size IO to a fast block device(like nvme) in SQPOLL
>>>>>> mode(since for non-SQPOLL, it doesn't make difference). This
>>>>>> way we
>>>>>> can
>>>>>> see the influence of reverse order handling.
>>>>>>
>>>>>> Regards,
>>>>>> Hao
>>>>>
>>>>> I see little difference locally, but there is quite a big stdev
>>>>> so it's
>>>>> possible my test setup is a bit wonky
>>>>>
>>>>> new:
>>>>> clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>>>>> lat (nsec): min=1440, max=16719k, avg=119714.72,
>>>>> stdev=153571.49
>>>>> old:
>>>>> clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>>>>> lat (nsec): min=1278, max=16610k, avg=121025.73,
>>>>> stdev=211896.14
>>>>>
>>>>
>>>> Hi Dylan,
>>>>
>>>> Could you post the arguments you use and the 99% 95% latency as
>>>> well?
>>>>
>>>> Regards,
>>>> Hao
>>>>
>>>
>>> One thing I'm worrying about is under heavy workloads, there are
>>> contiguous TWs coming in, thus the TWs at the end of the TW list
>>> doesn't
>>> get the chance to run, which leads to the latency of those ones
>>> becoming
>>> high.
>>
>> Pavel mentioned I should change some arguments, so I reran it. I'll
>> just post all the output below as not sure exactly what you are looking
>> for. Note I checked that it was definitely batching and it is doing
>> batches of 10-20 in tctx_task_work
>>
>>
>> *** config ***
>>
>> [global]
>> ioengine=io_uring
>> sqthread_poll=1
>> registerfiles=1
>> fixedbufs=1
>> hipri=0
>> thread=1
>> direct=0
>> rw=randread
>> time_based=1
>> runtime=600
>> ramp_time=30
>> randrepeat=0
>> group_reporting=0
>> sqthread_poll_cpu=15
>> iodepth=32
>>
>> [job0]
>> filename=/dev/nullb0
>> cpus_allowed=1
>> bs=512
>>
>> *** new ***
>> job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
>> 512B, ioengine=io_uring, iodepth=32
>> fio-3.30-59-gd4bf5
>> Starting 1 thread
>> Jobs: 1 (f=0): [f(1)][100.0%][r=360MiB/s][r=738k IOPS][eta 00m:00s]
>> job0: (groupid=0, jobs=1): err= 0: pid=37255: Wed Jun 22 03:44:23 2022
>> read: IOPS=596k, BW=291MiB/s (305MB/s)(171GiB/600001msec)
>> clat (msec): min=30343, max=630343, avg=369885.75, stdev=164921.26
>> lat (usec): min=14, max=1802, avg=53.23, stdev=18.84
>> clat percentiles (msec):
>> | 1.00th=[17113], 5.00th=[17113], 10.00th=[17113],
>> 20.00th=[17113],
>> | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
>> 60.00th=[17113],
>> | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
>> 95.00th=[17113],
>> | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
>> 99.95th=[17113],
>> | 99.99th=[17113]
>> bw ( KiB/s): min=169237, max=381603, per=100.00%, avg=298171.22,
>> stdev=70580.65, samples=1199
>> iops : min=338474, max=763206, avg=596342.60,
>> stdev=141161.31, samples=1199
>> lat (msec) : >=2000=100.00%
>> cpu : usr=99.98%, sys=0.00%, ctx=4378, majf=0, minf=9
>> IO depths : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%, >=64=0.0%
>> submit : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>> complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
>> issued rwts: total=357661967,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>> latency : target=0, window=0, percentile=100.00%, depth=32
>>
>> Run status group 0 (all jobs):
>> READ: bw=291MiB/s (305MB/s), 291MiB/s-291MiB/s (305MB/s-305MB/s),
>> io=171GiB (183GB), run=600001-600001msec
>>
>> Disk stats (read/write):
>> nullb0: ios=72127555/0, merge=11/0, ticks=1396298/0,
>> in_queue=1396298, util=100.00%
>> *** old ***
>>
>> job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
>> 512B, ioengine=io_uring, iodepth=32
>> fio-3.30-59-gd4bf5
>> Starting 1 thread
>> Jobs: 1 (f=1): [r(1)][100.0%][r=367MiB/s][r=751k IOPS][eta 00m:00s]
>> job0: (groupid=0, jobs=1): err= 0: pid=19216: Wed Jun 22 04:43:36 2022
>> read: IOPS=609k, BW=297MiB/s (312MB/s)(174GiB/600001msec)
>> clat (msec): min=30333, max=630333, avg=368961.53, stdev=164532.01
>> lat (usec): min=14, max=5830, avg=52.11, stdev=18.64
>> clat percentiles (msec):
>> | 1.00th=[17113], 5.00th=[17113], 10.00th=[17113],
>> 20.00th=[17113],
>> | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
>> 60.00th=[17113],
>> | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
>> 95.00th=[17113],
>> | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
>> 99.95th=[17113],
>> | 99.99th=[17113]
>> bw ( KiB/s): min=170273, max=386932, per=100.00%, avg=304548.39,
>> stdev=70732.20, samples=1200
>> iops : min=340547, max=773864, avg=609096.94,
>> stdev=141464.41, samples=1200
>> lat (msec) : >=2000=100.00%
>> cpu : usr=99.98%, sys=0.00%, ctx=3912, majf=0, minf=5
>> IO depths : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%, >=64=0.0%
>> submit : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>> complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
>> issued rwts: total=365258392,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>> latency : target=0, window=0, percentile=100.00%, depth=32
>>
>> Run status group 0 (all jobs):
>> READ: bw=297MiB/s (312MB/s), 297MiB/s-297MiB/s (312MB/s-312MB/s),
>> io=174GiB (187GB), run=600001-600001msec
>>
>> Disk stats (read/write):
>> nullb0: ios=69031421/0, merge=1/0, ticks=1323086/0, in_queue=1323086,
>> util=100.00%
>>
>
> Ok, the clat percentiles seems meanless here... from the min max and avg
^ meaningless
> data it should be fine. Thanks for testing.
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-22 11:24 ` Hao Xu
2022-06-22 11:51 ` Dylan Yudaken
@ 2022-06-22 11:52 ` Hao Xu
1 sibling, 0 replies; 20+ messages in thread
From: Hao Xu @ 2022-06-22 11:52 UTC (permalink / raw)
To: Dylan Yudaken, [email protected], [email protected],
[email protected]
Cc: Kernel Team
On 6/22/22 19:24, Hao Xu wrote:
> On 6/22/22 19:16, Hao Xu wrote:
>> On 6/22/22 17:31, Dylan Yudaken wrote:
>>> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>>>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>>>> Task work currently uses a spin lock to guard task_list and
>>>>>>> task_running. Some use cases such as networking can trigger
>>>>>>> task_work_add
>>>>>>> from multiple threads all at once, which suffers from
>>>>>>> contention
>>>>>>> here.
>>>>>>>
>>>>>>> This can be changed to use a lockless list which seems to have
>>>>>>> better
>>>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>>>> improvment in
>>>>>>> multithreaded task work add. It required removing the priority
>>>>>>> tw
>>>>>>> list
>>>>>>> optimisation, however it isn't clear how important that
>>>>>>> optimisation is.
>>>>>>> Additionally it has fairly easy to break semantics.
>>>>>>>
>>>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>>>> Patch 3-5 add lockless lists for task work
>>>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>>>
>>>>>>
>>>>>> Compared to the spinlock overhead, the prio task list
>>>>>> optimization is
>>>>>> definitely unimportant, so I agree with removing it here.
>>>>>> Replace the task list with llisy was something I considered but I
>>>>>> gave
>>>>>> it up since it changes the list to a stack which means we have to
>>>>>> handle
>>>>>> the tasks in a reverse order. This may affect the latency, do you
>>>>>> have
>>>>>> some numbers for it, like avg and 99% 95% lat?
>>>>>>
>>>>>
>>>>> Do you have an idea for how to test that? I used a microbenchmark
>>>>> as
>>>>> well as a network benchmark [1] to verify that overall throughput
>>>>> is
>>>>> higher. TW latency sounds a lot more complicated to measure as it's
>>>>> difficult to trigger accurately.
>>>>>
>>>>> My feeling is that with reasonable batching (say 8-16 items) the
>>>>> latency will be low as TW is generally very quick, but if you have
>>>>> an
>>>>> idea for benchmarking I can take a look
>>>>>
>>>>> [1]: https://github.com/DylanZA/netbench
>>>>
>>>> It can be normal IO requests I think. We can test the latency by fio
>>>> with small size IO to a fast block device(like nvme) in SQPOLL
>>>> mode(since for non-SQPOLL, it doesn't make difference). This way we
>>>> can
>>>> see the influence of reverse order handling.
>>>>
>>>> Regards,
>>>> Hao
>>>
>>> I see little difference locally, but there is quite a big stdev so it's
>>> possible my test setup is a bit wonky
>>>
>>> new:
>>> clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>>> lat (nsec): min=1440, max=16719k, avg=119714.72, stdev=153571.49
>>> old:
>>> clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>>> lat (nsec): min=1278, max=16610k, avg=121025.73, stdev=211896.14
>>>
>>
>> Hi Dylan,
>>
>> Could you post the arguments you use and the 99% 95% latency as well?
>>
>> Regards,
>> Hao
>>
>
> One thing I'm worrying about is under heavy workloads, there are
> contiguous TWs coming in, thus the TWs at the end of the TW list doesn't
> get the chance to run, which leads to the latency of those ones becoming
> high.
Ah, looking at the code again, it seems we take the whole list rather
than a single node each time, so it shouldn't be a big problem.
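[An llist-based variant would keep that property: the consumer detaches
the entire pending list in one atomic step, so entries that are already
queued cannot be starved by a stream of new arrivals. A sketch, with the
field name assumed as in the earlier snippets:]

  /* Everything queued up to this point becomes private to the consumer;
   * task work added afterwards lands on the now-empty work_llist and is
   * handled on the next run. */
  struct llist_node *node = llist_del_all(&tctx->work_llist);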
* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
2022-06-21 7:03 ` Dylan Yudaken
2022-06-21 7:34 ` Hao Xu
@ 2022-06-21 7:38 ` Hao Xu
1 sibling, 0 replies; 20+ messages in thread
From: Hao Xu @ 2022-06-21 7:38 UTC (permalink / raw)
To: Dylan Yudaken, [email protected], [email protected],
[email protected]
Cc: Kernel Team
On 6/21/22 15:03, Dylan Yudaken wrote:
> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>> Task work currently uses a spin lock to guard task_list and
>>> task_running. Some use cases such as networking can trigger
>>> task_work_add
>>> from multiple threads all at once, which suffers from contention
>>> here.
>>>
>>> This can be changed to use a lockless list which seems to have
>>> better
>>> performance. Running the micro benchmark in [1] I see 20%
>>> improvment in
>>> multithreaded task work add. It required removing the priority tw
>>> list
>>> optimisation, however it isn't clear how important that
>>> optimisation is.
>>> Additionally it has fairly easy to break semantics.
>>>
>>> Patch 1-2 remove the priority tw list optimisation
>>> Patch 3-5 add lockless lists for task work
>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>> Patch 7-8 adds tracing for task_work_run
>>>
>>
>> Compared to the spinlock overhead, the prio task list optimization is
>> definitely unimportant, so I agree with removing it here.
>> Replace the task list with llisy was something I considered but I
>> gave
>> it up since it changes the list to a stack which means we have to
>> handle
>> the tasks in a reverse order. This may affect the latency, do you
>> have
>> some numbers for it, like avg and 99% 95% lat?
>>
>
> Do you have an idea for how to test that? I used a microbenchmark as
> well as a network benchmark [1] to verify that overall throughput is
> higher. TW latency sounds a lot more complicated to measure as it's
I think measuring the end-to-end latency is good enough; there is no
need to get an accurate number for the TW queueing time.
> difficult to trigger accurately.
>
> My feeling is that with reasonable batching (say 8-16 items) the
> latency will be low as TW is generally very quick, but if you have an
> idea for benchmarking I can take a look
>
> [1]: https://github.com/DylanZA/netbench