public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH liburing] test: Add kworker-hang test
@ 2021-10-19 19:07 Ammar Faizi
  2021-10-20 12:51 ` [PATCH v2 " Ammar Faizi
  0 siblings, 1 reply; 11+ messages in thread
From: Ammar Faizi @ 2021-10-19 19:07 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Pavel Begunkov, io-uring Mailing List, Ammar Faizi

Add kworker-hang test to reproduce this:

  [11133.980033] INFO: task kworker/u8:3:16315 blocked for more than 10 seconds.
  [11133.980041]       Tainted: G           OE     5.15.0-rc4-for-5.16-io-uring-00060-g4922ab639eb6 #4
  [11133.980047] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [11133.980052] task:kworker/u8:3    state:D stack:    0 pid:16315 ppid:     2 flags:0x00004000
  [11133.980061] Workqueue: events_unbound io_ring_exit_work
  [11133.980068] Call Trace:
  [11133.980076]  __schedule+0x453/0x1850
  [11133.980097]  ? usleep_range+0x90/0x90
  [11133.980104]  schedule+0x59/0xc0
  [11133.980111]  schedule_timeout+0x1aa/0x1f0
  [11133.980119]  ? mark_held_locks+0x49/0x70
  [11133.980128]  ? lockdep_hardirqs_on_prepare+0xff/0x180
  [11133.980134]  ? _raw_spin_unlock_irq+0x24/0x40
  [11133.980143]  __wait_for_common+0xc2/0x170
  [11133.980158]  io_ring_exit_work+0x42c/0x44a
  [11133.980166]  ? io_uring_del_tctx_node+0xad/0xad
  [11133.980178]  ? verify_cpu+0xf0/0x100
  [11133.980193]  process_one_work+0x23b/0x550
  [11133.980209]  worker_thread+0x55/0x3c0
  [11133.980214]  ? process_one_work+0x550/0x550
  [11133.980223]  kthread+0x140/0x160
  [11133.980229]  ? set_kthread_struct+0x40/0x40
  [11133.980237]  ret_from_fork+0x1f/0x30
  [11133.980278]
                 Showing all locks held in the system:
  [11133.980282] 2 locks held by kworker/u8:0/8:
  [11133.980286]  #0: ffff888100106938 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1c1/0x550
  [11133.980304]  #1: ffffc90000187e70 ((work_completion)(&ctx->exit_work)){+.+.}-{0:0}, at: process_one_work+0x1c1/0x550
  [11133.980322] 1 lock held by khungtaskd/39:
  [11133.980325]  #0: ffffffff82977740 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x174
  [11133.980348] 2 locks held by kworker/u8:9/760:
  [11133.980351]  #0: ffff888100106938 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1c1/0x550
  [11133.980366]  #1: ffffc9000176fe70 ((work_completion)(&ctx->exit_work)){+.+.}-{0:0}, at: process_one_work+0x1c1/0x550
  [11133.980385] 1 lock held by in:imklog/926:
  [11133.980388]  #0: ffff88813208c2f0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4a/0x60
  [11133.980434] 1 lock held by htop/13609:
  [11133.980438] 2 locks held by kworker/u8:3/16315:
  [11133.980441]  #0: ffff888100106938 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x1c1/0x550
  [11133.980456]  #1: ffffc90008d43e70 ((work_completion)(&ctx->exit_work)){+.+.}-{0:0}, at: process_one_work+0x1c1/0x550
  [11133.980475] 1 lock held by dmesg/18421:
  [11133.980478]  #0: ffff88824be300d0 (&user->lock){+.+.}-{3:3}, at: devkmsg_read+0x4b/0x230

  [11133.980498] =============================================

Cc: Pavel Begunkov <[email protected]>
Link: https://github.com/axboe/liburing/issues/448
Signed-off-by: Ammar Faizi <[email protected]>
---
 test/Makefile       |   2 +
 test/kworker-hang.c | 289 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 291 insertions(+)
 create mode 100644 test/kworker-hang.c

diff --git a/test/Makefile b/test/Makefile
index 1a10a24..7c8691d 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -75,6 +75,7 @@ test_targets += \
 	io_uring_register \
 	io_uring_setup \
 	iopoll \
+	kworker-hang \
 	lfs-openat \
 	lfs-openat-write \
 	link \
@@ -227,6 +228,7 @@ test_srcs := \
 	io_uring_register.c \
 	io_uring_setup.c \
 	iopoll.c \
+	kworker-hang.c \
 	lfs-openat-write.c \
 	lfs-openat.c \
 	link-timeout.c \
diff --git a/test/kworker-hang.c b/test/kworker-hang.c
new file mode 100644
index 0000000..577b124
--- /dev/null
+++ b/test/kworker-hang.c
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: MIT */
+
+/*
+ * kworker-hang
+ *
+ * Link: https://github.com/axboe/liburing/issues/448
+ */
+
+#include <dirent.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/poll.h>
+#include <sys/eventfd.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+
+#include "helpers.h"
+#include "liburing.h"
+
+#define NR_RINGS			5
+#define WAIT_FOR_KWORKER_SECS		10
+#define WAIT_FOR_KWORKER_SECS_STR	"10"
+
+static void run_child(void)
+{
+	int ret, i;
+	struct io_uring rings[NR_RINGS];
+
+	for (i = 0; i < NR_RINGS; i++) {
+		struct io_uring_params p = { };
+		ret = io_uring_queue_init_params(64, &rings[i], &p);
+		if (ret) {
+			fprintf(stderr, "queue_init: %d/%d\n", ret, i);
+			goto err;
+		}
+	}
+
+	for (i = 0; i < NR_RINGS; i++)
+		io_uring_queue_exit(&rings[i]);
+
+	kill(getpid(), SIGSTOP);
+
+	/* We may have kworkers hang here, let the parent check it. */
+	return;
+
+err:
+	/* We can't create io_uring context */
+	exit(1);
+}
+
+static bool is_all_numeric(const char *pid)
+{
+	size_t i, l;
+	char c;
+
+	l = strnlen(pid, 32);
+	if (l == 0)
+		return false;
+
+	for (i = 0; i < l; i++) {
+		c = pid[i];
+		if (!('0' <= c && c <= '9'))
+			return false;
+	}
+
+	return true;
+}
+
+static bool is_kworker_event_unbound(const char *pid)
+{
+	int fd;
+	bool ret = false;
+	char fpath[256];
+	char read_buf[256] = {0};
+	ssize_t read_size;
+
+	snprintf(fpath, sizeof(fpath), "/proc/%s/comm", pid);
+
+	fd = open(fpath, O_RDONLY);
+	if (fd < 0)
+		return false;
+
+	read_size = read(fd, read_buf, sizeof(read_buf) - 1);
+	if (read_size < 0)
+		goto out;
+
+	if (!strncmp(read_buf, "kworker", 7) && strstr(read_buf, "events_unbound"))
+		ret = true;
+out:
+	close(fd);
+	return ret;
+}
+
+static bool is_on_ring_exit_work(const char *pid)
+{
+	int fd;
+	bool ret = false;
+	char fpath[256];
+	char read_buf[4096] = {0};
+	ssize_t read_size;
+
+	snprintf(fpath, sizeof(fpath), "/proc/%s/stack", pid);
+
+	fd = open(fpath, O_RDONLY);
+	if (fd < 0)
+		return false;
+
+	read_size = read(fd, read_buf, sizeof(read_buf) - 1);
+	if (read_size < 0)
+		goto out;
+
+	if (strstr(read_buf, "io_ring_exit_work"))
+		ret = true;
+out:
+	close(fd);
+	return ret;
+}
+
+static bool is_in_dreaded_d_state(const char *pid)
+{
+	int fd;
+	bool ret = false;
+	char fpath[256];
+	char read_buf[4096] = {0};
+	ssize_t read_size;
+	const char *p = read_buf;
+
+	snprintf(fpath, sizeof(fpath), "/proc/%s/stat", pid);
+
+	fd = open(fpath, O_RDONLY);
+	if (fd < 0)
+		return false;
+
+	read_size = read(fd, read_buf, sizeof(read_buf) - 1);
+	if (read_size < 0)
+		goto out;
+
+	/*
+	 * It looks like this:
+	 * 9384 (kworker/u8:8+events_unbound) D 2 0 0 0 -1 69238880 0 0 0 0 0 0 0 0 20 0 1 0 
+	 *
+	 * Catch the 'D'!
+	 */
+	while (*p != ')') {
+		p++;
+		if (&p[2] >= &read_buf[sizeof(read_buf) - 1])
+			/*
+			 * /proc/$pid/stack shows the wrong output?
+			 */
+			goto out;
+	}
+
+	ret = (p[2] == 'D');
+out:
+	close(fd);
+	return ret;
+}
+
+/*
+ * Returns 1 if we have kworker hang or fail to open `/proc`.
+ */
+static int scan_kworker_hang(void)
+{
+	DIR *dr;
+	int ret = 0;
+	struct dirent *de;
+
+	dr = opendir("/proc");
+	if (dr == NULL) {
+		perror("opendir");
+		return 1;
+	}
+
+	while (1) {
+		const char *pid;
+
+		de = readdir(dr);
+		if (!de)
+			break;
+
+		pid = de->d_name;
+		if (!is_all_numeric(pid))
+			continue;
+
+		if (!is_kworker_event_unbound(pid))
+			continue;
+
+		if (!is_on_ring_exit_work(pid))
+			continue;
+
+		if (is_in_dreaded_d_state(pid)) {
+			/* kworker is hang?! */
+			printf("Bug: found hang kworker on io_ring_exit_work "
+				"/proc/%s\n", pid);
+			ret = 1;
+		}
+	}
+
+	return ret;
+}
+
+static void set_hung_entries(void)
+{
+	const char *cmds[] = {
+		/* Backup current values */
+		"cat /proc/sys/kernel/hung_task_all_cpu_backtrace > hung_task_all_cpu_backtrace.bak",
+		"cat /proc/sys/kernel/hung_task_check_count > hung_task_check_count.bak",
+		"cat /proc/sys/kernel/hung_task_panic > hung_task_panic.bak",
+		"cat /proc/sys/kernel/hung_task_check_interval_secs > hung_task_check_interval_secs.bak",
+		"cat /proc/sys/kernel/hung_task_timeout_secs > hung_task_timeout_secs.bak",
+		"cat /proc/sys/kernel/hung_task_warnings > hung_task_warnings.bak",
+
+		/* Set to do the test. */
+		"echo 1 > /proc/sys/kernel/hung_task_all_cpu_backtrace",
+		"echo 99999999 > /proc/sys/kernel/hung_task_check_count",
+		"echo 0 > /proc/sys/kernel/hung_task_panic",
+		"echo 1 > /proc/sys/kernel/hung_task_check_interval_secs",
+		"echo " WAIT_FOR_KWORKER_SECS_STR " > /proc/sys/kernel/hung_task_timeout_secs",
+		"echo -1 > /proc/sys/kernel/hung_task_warnings",
+	};
+	int p;
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(cmds); i++)
+		p = system(cmds[i]);
+
+	(void)p;
+}
+
+static void restore_hung_entries(void)
+{
+	const char *cmds[] = {
+		"cat hung_task_all_cpu_backtrace.bak > /proc/sys/kernel/hung_task_all_cpu_backtrace",
+		"cat hung_task_check_count.bak > /proc/sys/kernel/hung_task_check_count",
+		"cat hung_task_panic.bak > /proc/sys/kernel/hung_task_panic",
+		"cat hung_task_check_interval_secs.bak > /proc/sys/kernel/hung_task_check_interval_secs",
+		"cat hung_task_timeout_secs.bak > /proc/sys/kernel/hung_task_timeout_secs",
+		"cat hung_task_warnings.bak > /proc/sys/kernel/hung_task_warnings",
+
+		/* Clean up! */
+		"rm -f " \
+		"hung_task_all_cpu_backtrace.bak " \
+		"hung_task_check_count.bak " \
+		"hung_task_panic.bak " \
+		"hung_task_check_interval_secs.bak " \
+		"hung_task_timeout_secs.bak " \
+		"hung_task_warnings.bak"
+	};
+	int p;
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(cmds); i++)
+		p = system(cmds[i]);
+
+	(void)p;
+}
+
+int main(int argc, char *argv[])
+{
+	int ret;
+	pid_t child_pid;
+
+	if (getuid() != 0 && geteuid() != 0) {
+		printf("Skipping kworker-hang: not root\n");
+		return 0;
+	}
+
+	set_hung_entries();
+	child_pid = fork();
+
+	if (!child_pid) {
+		run_child();
+		return 0;
+	}
+
+	/* +2 just to add small extra time for khungtaskd. */
+	sleep(WAIT_FOR_KWORKER_SECS + 2);
+	ret = scan_kworker_hang();
+	restore_hung_entries();
+
+	kill(child_pid, SIGCONT);
+	wait(NULL);
+	return ret;
+}
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-11-09 15:21 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-10-19 19:07 [PATCH liburing] test: Add kworker-hang test Ammar Faizi
2021-10-20 12:51 ` [PATCH v2 " Ammar Faizi
2021-11-05 23:35   ` Ammar Faizi
2021-11-05 23:41     ` Jens Axboe
2021-11-06  0:58       ` [PATCH v3 " Ammar Faizi
2021-11-06  7:42         ` [PATCH v4 " Ammar Faizi
2021-11-06 11:37           ` [PATCH v5 " Ammar Faizi
2021-11-06 11:49             ` [PATCH v6 " Ammar Faizi
2021-11-09  5:44               ` Ammar Faizi
2021-11-09 14:47                 ` Jens Axboe
2021-11-09 15:21                   ` Ammar Faizi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox