public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH v3] io_uring: add timeout support for io_uring_enter()
@ 2020-11-02  8:50 Hao Xu
  2020-11-03  2:54 ` [PATCH v3 RESEND] " Hao Xu
  0 siblings, 1 reply; 12+ messages in thread
From: Hao Xu @ 2020-11-02  8:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

From: Hao Xu <[email protected]>

Currently, users who want to be woken up after a timeout while waiting for
events must first submit a timeout command. That is not safe for
applications that split SQ and CQ handling between two threads, such as
mysql: they have to synchronize the two threads explicitly to protect the
SQ, which hurts performance.

This patch adds support for timeout to existing io_uring_enter(). To
avoid overloading arguments, it introduces a new parameter structure
which contains sigmask and timeout.

I have tested the workloads with one thread submitting nop requests
while the other reaping the cqe with timeout. It shows 1.8~2x faster
when the iodepth is 16.

Signed-off-by: Jiufei Xue <[email protected]>
Signed-off-by: Hao Xu <[email protected]>
---
 fs/io_uring.c                 | 48 +++++++++++++++++++++++++++++++++++++------
 include/uapi/linux/io_uring.h |  7 +++++++
 2 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91e2cc8414f9..cd89a7fbaafd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6653,7 +6653,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
  * application must reap them itself, as they reside on the shared cq ring.
  */
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
-			  const sigset_t __user *sig, size_t sigsz)
+			  const sigset_t __user *sig, size_t sigsz,
+			  struct __kernel_timespec __user *uts)
 {
 	struct io_wait_queue iowq = {
 		.wq = {
@@ -6665,6 +6666,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
+	struct timespec64 ts;
+	signed long timeout = 0;
 	int ret = 0;
 
 	do {
@@ -6687,6 +6690,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
+	if (uts) {
+		if (get_timespec64(&ts, uts))
+			return -EFAULT;
+		timeout = timespec64_to_jiffies(&ts);
+	}
+
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
@@ -6708,7 +6717,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		}
 		if (io_should_wake(&iowq, false))
 			break;
-		schedule();
+		if (uts) {
+			timeout = schedule_timeout(timeout);
+			if (timeout == 0) {
+				ret = -ETIME;
+				break;
+			}
+		} else {
+			schedule();
+		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);
 
@@ -8207,19 +8224,38 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 #endif /* !CONFIG_MMU */
 
 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
-		u32, min_complete, u32, flags, const sigset_t __user *, sig,
+		u32, min_complete, u32, flags, const void __user *, argp,
 		size_t, sigsz)
 {
 	struct io_ring_ctx *ctx;
 	long ret = -EBADF;
 	int submitted = 0;
 	struct fd f;
+	const sigset_t __user *sig;
+	struct __kernel_timespec __user *ts;
+	struct io_uring_getevents_arg arg;
 
 	io_run_task_work();
 
-	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
+	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
+		      IORING_ENTER_GETEVENTS_TIMEOUT))
 		return -EINVAL;
 
+	/* deal with IORING_ENTER_GETEVENTS_TIMEOUT */
+	if (flags & IORING_ENTER_GETEVENTS_TIMEOUT) {
+		if (!(flags & IORING_ENTER_GETEVENTS))
+			return -EINVAL;
+		if (sigsz != sizeof(arg))
+			return -EINVAL;
+		if (copy_from_user(&arg, argp, sizeof(arg)))
+			return -EFAULT;
+		sig = (const sigset_t __user *)arg.sigmask;
+		ts = arg.ts;
+	} else {
+		sig = (const sigset_t __user *)argp;
+		ts = NULL;
+	}
+
 	f = fdget(fd);
 	if (!f.file)
 		return -EBADF;
@@ -8266,7 +8302,7 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
 			ret = io_iopoll_check(ctx, min_complete);
 		} else {
-			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+			ret = io_cqring_wait(ctx, min_complete, sig, sigsz, ts);
 		}
 	}
 
@@ -8572,7 +8608,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
-			IORING_FEAT_POLL_32BITS;
+			IORING_FEAT_POLL_32BITS | IORING_FEAT_GETEVENTS_TIMEOUT;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d65fde732518..68b94617981a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -224,6 +224,7 @@ struct io_cqring_offsets {
  */
 #define IORING_ENTER_GETEVENTS	(1U << 0)
 #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
+#define IORING_ENTER_GETEVENTS_TIMEOUT	(1U << 2)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -251,6 +252,7 @@ struct io_uring_params {
 #define IORING_FEAT_CUR_PERSONALITY	(1U << 4)
 #define IORING_FEAT_FAST_POLL		(1U << 5)
 #define IORING_FEAT_POLL_32BITS 	(1U << 6)
+#define IORING_FEAT_GETEVENTS_TIMEOUT	(1U << 7)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -290,4 +292,9 @@ struct io_uring_probe {
 	struct io_uring_probe_op ops[0];
 };
 
+struct io_uring_getevents_arg {
+	u64 *sigmask;
+	struct __kernel_timespec *ts;
+};
+
 #endif
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-02  8:50 [PATCH v3] io_uring: add timeout support for io_uring_enter() Hao Xu
@ 2020-11-03  2:54 ` Hao Xu
  2020-11-04 17:50   ` Jens Axboe
  0 siblings, 1 reply; 12+ messages in thread
From: Hao Xu @ 2020-11-03  2:54 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

Now users who want to get woken when waiting for events should submit a
timeout command first. It is not safe for applications that split SQ and
CQ handling between two threads, such as mysql. Users should synchronize
the two threads explicitly to protect SQ and that will impact the
performance.

This patch adds support for timeout to existing io_uring_enter(). To
avoid overloading arguments, it introduces a new parameter structure
which contains sigmask and timeout.

I have tested the workloads with one thread submitting nop requests
while the other reaping the cqe with timeout. It shows 1.8~2x faster
when the iodepth is 16.

Signed-off-by: Jiufei Xue <[email protected]>
Signed-off-by: Hao Xu <[email protected]>
---
 fs/io_uring.c                 | 48 +++++++++++++++++++++++++++++++++++++------
 include/uapi/linux/io_uring.h |  7 +++++++
 2 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91e2cc8414f9..cd89a7fbaafd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6653,7 +6653,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
  * application must reap them itself, as they reside on the shared cq ring.
  */
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
-			  const sigset_t __user *sig, size_t sigsz)
+			  const sigset_t __user *sig, size_t sigsz,
+			  struct __kernel_timespec __user *uts)
 {
 	struct io_wait_queue iowq = {
 		.wq = {
@@ -6665,6 +6666,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
+	struct timespec64 ts;
+	signed long timeout = 0;
 	int ret = 0;
 
 	do {
@@ -6687,6 +6690,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
+	if (uts) {
+		if (get_timespec64(&ts, uts))
+			return -EFAULT;
+		timeout = timespec64_to_jiffies(&ts);
+	}
+
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
@@ -6708,7 +6717,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		}
 		if (io_should_wake(&iowq, false))
 			break;
-		schedule();
+		if (uts) {
+			timeout = schedule_timeout(timeout);
+			if (timeout == 0) {
+				ret = -ETIME;
+				break;
+			}
+		} else {
+			schedule();
+		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);
 
@@ -8207,19 +8224,38 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 #endif /* !CONFIG_MMU */
 
 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
-		u32, min_complete, u32, flags, const sigset_t __user *, sig,
+		u32, min_complete, u32, flags, const void __user *, argp,
 		size_t, sigsz)
 {
 	struct io_ring_ctx *ctx;
 	long ret = -EBADF;
 	int submitted = 0;
 	struct fd f;
+	const sigset_t __user *sig;
+	struct __kernel_timespec __user *ts;
+	struct io_uring_getevents_arg arg;
 
 	io_run_task_work();
 
-	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
+	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
+		      IORING_ENTER_GETEVENTS_TIMEOUT))
 		return -EINVAL;
 
+	/* deal with IORING_ENTER_GETEVENTS_TIMEOUT */
+	if (flags & IORING_ENTER_GETEVENTS_TIMEOUT) {
+		if (!(flags & IORING_ENTER_GETEVENTS))
+			return -EINVAL;
+		if (sigsz != sizeof(arg))
+			return -EINVAL;
+		if (copy_from_user(&arg, argp, sizeof(arg)))
+			return -EFAULT;
+		sig = arg.sigmask;
+		ts = arg.ts;
+	} else {
+		sig = (const sigset_t __user *)argp;
+		ts = NULL;
+	}
+
 	f = fdget(fd);
 	if (!f.file)
 		return -EBADF;
@@ -8266,7 +8302,7 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
 			ret = io_iopoll_check(ctx, min_complete);
 		} else {
-			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+			ret = io_cqring_wait(ctx, min_complete, sig, sigsz, ts);
 		}
 	}
 
@@ -8572,7 +8608,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
-			IORING_FEAT_POLL_32BITS;
+			IORING_FEAT_POLL_32BITS | IORING_FEAT_GETEVENTS_TIMEOUT;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d65fde732518..68b94617981a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -224,6 +224,7 @@ struct io_cqring_offsets {
  */
 #define IORING_ENTER_GETEVENTS	(1U << 0)
 #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
+#define IORING_ENTER_GETEVENTS_TIMEOUT	(1U << 2)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -251,6 +252,7 @@ struct io_uring_params {
 #define IORING_FEAT_CUR_PERSONALITY	(1U << 4)
 #define IORING_FEAT_FAST_POLL		(1U << 5)
 #define IORING_FEAT_POLL_32BITS 	(1U << 6)
+#define IORING_FEAT_GETEVENTS_TIMEOUT	(1U << 7)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -290,4 +292,9 @@ struct io_uring_probe {
 	struct io_uring_probe_op ops[0];
 };
 
+struct io_uring_getevents_arg {
+	sigset_t *sigmask;
+	struct __kernel_timespec *ts;
+};
+
 #endif
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-03  2:54 ` [PATCH v3 RESEND] " Hao Xu
@ 2020-11-04 17:50   ` Jens Axboe
  2020-11-04 18:32     ` Jens Axboe
  0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2020-11-04 17:50 UTC (permalink / raw)
  To: Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 11/2/20 7:54 PM, Hao Xu wrote:
> Now users who want to get woken when waiting for events should submit a
> timeout command first. It is not safe for applications that split SQ and
> CQ handling between two threads, such as mysql. Users should synchronize
> the two threads explicitly to protect SQ and that will impact the
> performance.
> 
> This patch adds support for timeout to existing io_uring_enter(). To
> avoid overloading arguments, it introduces a new parameter structure
> which contains sigmask and timeout.
> 
> I have tested the workloads with one thread submitting nop requests
> while the other reaping the cqe with timeout. It shows 1.8~2x faster
> when the iodepth is 16.

I have applied this one for 5.11 with a caveat - you generated it against
some older base, so some parts had to be hand applied. But the important
bit is that the values you chose for IORING_ENTER_GETEVENTS_TIMEOUT and
IORING_FEAT_GETEVENTS_TIMEOUT are already in use in 5.10 (let alone
5.11 pending), so they had to be renumbered. Just something to keep in
mind if you have existing code/apps that rely on the value in your
patches.

It'd also be great if you could submit a liburing patch for adding these
definitions, and with a test case as well. All new features should come
with a test case for liburing. This one in particular will enable
io_uring_wait_cqes() to work without queueing an internal timeout, so
it'll be a nice cleanup. I might just do this one myself, unless you
feel so inclined to tackle that one, too.


commit f84ccf564ee28205f87bea4f3925cf9a4c2ad0e3
Author: Hao Xu <[email protected]>
Date:   Tue Nov 3 10:54:37 2020 +0800

    io_uring: add timeout support for io_uring_enter()
    
    Now users who want to get woken when waiting for events should submit a
    timeout command first. It is not safe for applications that split SQ and
    CQ handling between two threads, such as mysql. Users should synchronize
    the two threads explicitly to protect SQ and that will impact the
    performance.
    
    This patch adds support for timeout to existing io_uring_enter(). To
    avoid overloading arguments, it introduces a new parameter structure
    which contains sigmask and timeout.
    
    I have tested the workloads with one thread submitting nop requests
    while the other reaping the cqe with timeout. It shows 1.8~2x faster
    when the iodepth is 16.
    
    Signed-off-by: Jiufei Xue <[email protected]>
    Signed-off-by: Hao Xu <[email protected]>
    Signed-off-by: Jens Axboe <[email protected]>

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 864751d64097..9b9941e0b818 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7110,7 +7110,8 @@ static int io_run_task_work_sig(void)
  * application must reap them itself, as they reside on the shared cq ring.
  */
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
-			  const sigset_t __user *sig, size_t sigsz)
+			  const sigset_t __user *sig, size_t sigsz,
+			  struct __kernel_timespec __user *uts)
 {
 	struct io_wait_queue iowq = {
 		.wq = {
@@ -7122,6 +7123,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
+	struct timespec64 ts;
+	signed long timeout = 0;
 	int ret = 0;
 
 	do {
@@ -7144,6 +7147,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
+	if (uts) {
+		if (get_timespec64(&ts, uts))
+			return -EFAULT;
+		timeout = timespec64_to_jiffies(&ts);
+	}
+
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
@@ -7157,7 +7166,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			break;
 		if (io_should_wake(&iowq, false))
 			break;
-		schedule();
+		if (uts) {
+			timeout = schedule_timeout(timeout);
+			if (timeout == 0) {
+				ret = -ETIME;
+				break;
+			}
+		} else {
+			schedule();
+		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);
 
@@ -9130,20 +9147,38 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 }
 
 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
-		u32, min_complete, u32, flags, const sigset_t __user *, sig,
+		u32, min_complete, u32, flags, const void __user *, argp,
 		size_t, sigsz)
 {
 	struct io_ring_ctx *ctx;
 	long ret = -EBADF;
 	int submitted = 0;
 	struct fd f;
+	const sigset_t __user *sig;
+	struct __kernel_timespec __user *ts;
+	struct io_uring_getevents_arg arg;
 
 	io_run_task_work();
 
 	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			IORING_ENTER_SQ_WAIT))
+			IORING_ENTER_SQ_WAIT | IORING_ENTER_GETEVENTS_TIMEOUT))
 		return -EINVAL;
 
+	/* deal with IORING_ENTER_GETEVENTS_TIMEOUT */
+	if (flags & IORING_ENTER_GETEVENTS_TIMEOUT) {
+		if (!(flags & IORING_ENTER_GETEVENTS))
+			return -EINVAL;
+		if (sigsz != sizeof(arg))
+			return -EINVAL;
+		if (copy_from_user(&arg, argp, sizeof(arg)))
+			return -EFAULT;
+		sig = arg.sigmask;
+		ts = arg.ts;
+	} else {
+		sig = (const sigset_t __user *)argp;
+		ts = NULL;
+	}
+
 	f = fdget(fd);
 	if (!f.file)
 		return -EBADF;
@@ -9199,7 +9234,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
 			ret = io_iopoll_check(ctx, min_complete);
 		} else {
-			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+			ret = io_cqring_wait(ctx, min_complete, sig, sigsz, ts);
 		}
 	}
 
@@ -9561,7 +9596,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
-			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED;
+			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
+			IORING_FEAT_GETEVENTS_TIMEOUT;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 557e7eae497f..fefee28c3ed8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_GETEVENTS	(1U << 0)
 #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
 #define IORING_ENTER_SQ_WAIT	(1U << 2)
+#define IORING_ENTER_GETEVENTS_TIMEOUT	(1U << 3)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -259,6 +260,7 @@ struct io_uring_params {
 #define IORING_FEAT_FAST_POLL		(1U << 5)
 #define IORING_FEAT_POLL_32BITS 	(1U << 6)
 #define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
+#define IORING_FEAT_GETEVENTS_TIMEOUT	(1U << 8)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -335,4 +337,9 @@ enum {
 	IORING_RESTRICTION_LAST
 };
 
+struct io_uring_getevents_arg {
+	sigset_t *sigmask;
+	struct __kernel_timespec *ts;
+};
+
 #endif

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 17:50   ` Jens Axboe
@ 2020-11-04 18:32     ` Jens Axboe
  2020-11-04 19:06       ` Jens Axboe
  2020-11-04 19:27       ` Pavel Begunkov
  0 siblings, 2 replies; 12+ messages in thread
From: Jens Axboe @ 2020-11-04 18:32 UTC (permalink / raw)
  To: Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 11/4/20 10:50 AM, Jens Axboe wrote:
> +struct io_uring_getevents_arg {
> +	sigset_t *sigmask;
> +	struct __kernel_timespec *ts;
> +};
> +

I missed that this is still not right, I did bring it up in your last
posting though - you can't have pointers as a user API, since the size
of the pointer will vary depending on whether this is a 32-bit or 64-bit
arch (or 32-bit app running on 64-bit kernel).

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7e6945383907..2f533f6815ea 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9158,8 +9158,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			return -EINVAL;
 		if (copy_from_user(&arg, argp, sizeof(arg)))
 			return -EFAULT;
-		sig = arg.sigmask;
-		ts = arg.ts;
+		sig = u64_to_user_ptr(arg.sigmask);
+		ts = u64_to_user_ptr(arg.ts);
 	} else {
 		sig = (const sigset_t __user *)argp;
 		ts = NULL;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index fefee28c3ed8..0b104891df68 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -338,8 +338,8 @@ enum {
 };
 
 struct io_uring_getevents_arg {
-	sigset_t *sigmask;
-	struct __kernel_timespec *ts;
+	__u64	sigmask;
+	__u64	ts;
 };
 
 #endif

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 18:32     ` Jens Axboe
@ 2020-11-04 19:06       ` Jens Axboe
  2020-11-04 19:27       ` Pavel Begunkov
  1 sibling, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2020-11-04 19:06 UTC (permalink / raw)
  To: Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 11/4/20 11:32 AM, Jens Axboe wrote:
> On 11/4/20 10:50 AM, Jens Axboe wrote:
>> +struct io_uring_getevents_arg {
>> +	sigset_t *sigmask;
>> +	struct __kernel_timespec *ts;
>> +};
>> +
> 
> I missed that this is still not right, I did bring it up in your last
> posting though - you can't have pointers as a user API, since the size
> of the pointer will vary depending on whether this is a 32-bit or 64-bit
> arch (or 32-bit app running on 64-bit kernel).

You also made the sigmask size go away if we're using getevent_arg, we
need to include that. It'll break right now if you give both the sigmask
and a timeout, as you're passing in the total arg size for 'ts'.

Here's my (hopefully) final fixed version:


commit 1fda0f709ac2a51c7baa9899501dbf08883fa92c
Author: Hao Xu <[email protected]>
Date:   Tue Nov 3 10:54:37 2020 +0800

    io_uring: add timeout support for io_uring_enter()
    
    Now users who want to get woken when waiting for events should submit a
    timeout command first. It is not safe for applications that split SQ and
    CQ handling between two threads, such as mysql. Users should synchronize
    the two threads explicitly to protect SQ and that will impact the
    performance.
    
    This patch adds support for timeout to existing io_uring_enter(). To
    avoid overloading arguments, it introduces a new parameter structure
    which contains sigmask and timeout.
    
    I have tested the workloads with one thread submitting nop requests
    while the other reaping the cqe with timeout. It shows 1.8~2x faster
    when the iodepth is 16.
    
    Signed-off-by: Jiufei Xue <[email protected]>
    Signed-off-by: Hao Xu <[email protected]>
    Signed-off-by: Jens Axboe <[email protected]>

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 864751d64097..8439cda54e21 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7110,7 +7110,8 @@ static int io_run_task_work_sig(void)
  * application must reap them itself, as they reside on the shared cq ring.
  */
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
-			  const sigset_t __user *sig, size_t sigsz)
+			  const sigset_t __user *sig, size_t sigsz,
+			  struct __kernel_timespec __user *uts)
 {
 	struct io_wait_queue iowq = {
 		.wq = {
@@ -7122,6 +7123,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
+	struct timespec64 ts;
+	signed long timeout = 0;
 	int ret = 0;
 
 	do {
@@ -7144,6 +7147,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
+	if (uts) {
+		if (get_timespec64(&ts, uts))
+			return -EFAULT;
+		timeout = timespec64_to_jiffies(&ts);
+	}
+
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
@@ -7157,7 +7166,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			break;
 		if (io_should_wake(&iowq, false))
 			break;
-		schedule();
+		if (uts) {
+			timeout = schedule_timeout(timeout);
+			if (timeout == 0) {
+				ret = -ETIME;
+				break;
+			}
+		} else {
+			schedule();
+		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);
 
@@ -9130,20 +9147,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 }
 
 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
-		u32, min_complete, u32, flags, const sigset_t __user *, sig,
+		u32, min_complete, u32, flags, const void __user *, argp,
 		size_t, sigsz)
 {
 	struct io_ring_ctx *ctx;
 	long ret = -EBADF;
 	int submitted = 0;
 	struct fd f;
+	const sigset_t __user *sig;
+	struct __kernel_timespec __user *ts;
+	struct io_uring_getevents_arg arg;
 
 	io_run_task_work();
 
 	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			IORING_ENTER_SQ_WAIT))
+			IORING_ENTER_SQ_WAIT | IORING_ENTER_GETEVENTS_TIMEOUT))
 		return -EINVAL;
 
+	/* deal with IORING_ENTER_GETEVENTS_TIMEOUT */
+	if (flags & IORING_ENTER_GETEVENTS_TIMEOUT) {
+		if (!(flags & IORING_ENTER_GETEVENTS))
+			return -EINVAL;
+		if (sigsz != sizeof(arg))
+			return -EINVAL;
+		if (copy_from_user(&arg, argp, sizeof(arg)))
+			return -EFAULT;
+		sig = u64_to_user_ptr(arg.sigmask);
+		sigsz = arg.sigmask_sz;
+		ts = u64_to_user_ptr(arg.ts);
+	} else {
+		sig = (const sigset_t __user *)argp;
+		ts = NULL;
+	}
+
 	f = fdget(fd);
 	if (!f.file)
 		return -EBADF;
@@ -9199,7 +9235,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
 			ret = io_iopoll_check(ctx, min_complete);
 		} else {
-			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+			ret = io_cqring_wait(ctx, min_complete, sig, sigsz, ts);
 		}
 	}
 
@@ -9561,7 +9597,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
-			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED;
+			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
+			IORING_FEAT_GETEVENTS_TIMEOUT;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 557e7eae497f..1a92985a9ee8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_GETEVENTS	(1U << 0)
 #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
 #define IORING_ENTER_SQ_WAIT	(1U << 2)
+#define IORING_ENTER_GETEVENTS_TIMEOUT	(1U << 3)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -259,6 +260,7 @@ struct io_uring_params {
 #define IORING_FEAT_FAST_POLL		(1U << 5)
 #define IORING_FEAT_POLL_32BITS 	(1U << 6)
 #define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
+#define IORING_FEAT_GETEVENTS_TIMEOUT	(1U << 8)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -335,4 +337,11 @@ enum {
 	IORING_RESTRICTION_LAST
 };
 
+struct io_uring_getevents_arg {
+	__u64	sigmask;
+	__u32	sigmask_sz;
+	__u32	pad;
+	__u64	ts;
+};
+
 #endif

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 18:32     ` Jens Axboe
  2020-11-04 19:06       ` Jens Axboe
@ 2020-11-04 19:27       ` Pavel Begunkov
  2020-11-04 19:34         ` Jens Axboe
  1 sibling, 1 reply; 12+ messages in thread
From: Pavel Begunkov @ 2020-11-04 19:27 UTC (permalink / raw)
  To: Jens Axboe, Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 04/11/2020 18:32, Jens Axboe wrote:
> On 11/4/20 10:50 AM, Jens Axboe wrote:
>> +struct io_uring_getevents_arg {
>> +	sigset_t *sigmask;
>> +	struct __kernel_timespec *ts;
>> +};
>> +
> 
> I missed that this is still not right, I did bring it up in your last
> posting though - you can't have pointers as a user API, since the size
> of the pointer will vary depending on whether this is a 32-bit or 64-bit
> arch (or 32-bit app running on 64-bit kernel).

Maybe it would be better 

1) to kill this extra indirection?

struct io_uring_getevents_arg {
-	sigset_t *sigmask;
-	struct __kernel_timespec *ts;
+	sigset_t sigmask;
+	struct __kernel_timespec ts;
};

then,

sigset_t *sig = (...)arg;
__kernel_timespec* ts = (...)(arg + offset);


It'd spare us from IORING_ENTER_GETEVENTS_TIMEOUT but we'd need
to find a way to disable some of them. E.g. don't use sigmask when
the user doesn't want it, but sigsz == sizeof(io_uring_getevents_arg),

and parsing would look like

switch (argsz) {
case sizeof(struct io_uring_getevents_arg): {
	struct __kernel_timespec ts = argp + ts_offset;
	...
}
fallthrough;
case sizeof(sig): {
	const sigset_t __user *sig = argp;
	...
	break;
}
default:
	return -EINVAL;
}

2) and move all the parsing into io_cqring_wait(). That sounds better
performance-wise.

> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 7e6945383907..2f533f6815ea 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -9158,8 +9158,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>  			return -EINVAL;
>  		if (copy_from_user(&arg, argp, sizeof(arg)))
>  			return -EFAULT;
> -		sig = arg.sigmask;
> -		ts = arg.ts;
> +		sig = u64_to_user_ptr(arg.sigmask);
> +		ts = u64_to_user_ptr(arg.ts);
>  	} else {
>  		sig = (const sigset_t __user *)argp;
>  		ts = NULL;
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index fefee28c3ed8..0b104891df68 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -338,8 +338,8 @@ enum {
>  };
>  
>  struct io_uring_getevents_arg {
> -	sigset_t *sigmask;
> -	struct __kernel_timespec *ts;
> +	__u64	sigmask;
> +	__u64	ts;
>  };
>  
>  #endif
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 19:27       ` Pavel Begunkov
@ 2020-11-04 19:34         ` Jens Axboe
  2020-11-04 20:16           ` Pavel Begunkov
  0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2020-11-04 19:34 UTC (permalink / raw)
  To: Pavel Begunkov, Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 11/4/20 12:27 PM, Pavel Begunkov wrote:
> On 04/11/2020 18:32, Jens Axboe wrote:
>> On 11/4/20 10:50 AM, Jens Axboe wrote:
>>> +struct io_uring_getevents_arg {
>>> +	sigset_t *sigmask;
>>> +	struct __kernel_timespec *ts;
>>> +};
>>> +
>>
>> I missed that this is still not right, I did bring it up in your last
>> posting though - you can't have pointers as a user API, since the size
>> of the pointer will vary depending on whether this is a 32-bit or 64-bit
>> arch (or 32-bit app running on 64-bit kernel).
> 
> Maybe it would be better 
> 
> 1) to kill this extra indirection?
> 
> struct io_uring_getevents_arg {
> -	sigset_t *sigmask;
> -	struct __kernel_timespec *ts;
> +	sigset_t sigmask;
> +	struct __kernel_timespec ts;
> };
> 
> then,
> 
> sigset_t *sig = (...)arg;
> __kernel_timespec* ts = (...)(arg + offset);

But then it's kind of hard to know which, if any, of them are set... I
did think about this, and any solution seemed worse than just having the
extra indirection.

Yeah, not doing the extra indirection would save a copy, but don't think
it's worth it for this path.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 19:34         ` Jens Axboe
@ 2020-11-04 20:16           ` Pavel Begunkov
  2020-11-04 20:28             ` Jens Axboe
  0 siblings, 1 reply; 12+ messages in thread
From: Pavel Begunkov @ 2020-11-04 20:16 UTC (permalink / raw)
  To: Jens Axboe, Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 04/11/2020 19:34, Jens Axboe wrote:
> On 11/4/20 12:27 PM, Pavel Begunkov wrote:
>> On 04/11/2020 18:32, Jens Axboe wrote:
>>> On 11/4/20 10:50 AM, Jens Axboe wrote:
>>>> +struct io_uring_getevents_arg {
>>>> +	sigset_t *sigmask;
>>>> +	struct __kernel_timespec *ts;
>>>> +};
>>>> +
>>>
>>> I missed that this is still not right, I did bring it up in your last
>>> posting though - you can't have pointers as a user API, since the size
>>> of the pointer will vary depending on whether this is a 32-bit or 64-bit
>>> arch (or 32-bit app running on 64-bit kernel).
>>
>> Maybe it would be better 
>>
>> 1) to kill this extra indirection?
>>
>> struct io_uring_getevents_arg {
>> -	sigset_t *sigmask;
>> -	struct __kernel_timespec *ts;
>> +	sigset_t sigmask;
>> +	struct __kernel_timespec ts;
>> };
>>
>> then,
>>
>> sigset_t *sig = (...)arg;
>> __kernel_timespec* ts = (...)(arg + offset);
> 
> But then it's kind of hard to know which, if any, of them are set... I
> did think about this, and any solution seemed worse than just having the
> extra indirection.

struct io_uring_getevents_arg {
	sigset_t sigmask;
	u32 mask;
	struct __kernel_timespec ts;
};

if size > sizeof(sigmask), then use mask to determine that.
Though, not sure how horrid the rest of the code would be.

> 
> Yeah, not doing the extra indirection would save a copy, but don't think
> it's worth it for this path.

I dislike branching like IORING_ENTER_GETEVENTS_TIMEOUT much more, from a
conceptual point of view. I may try it out to see how it looks while it's
still in for-next.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 20:16           ` Pavel Begunkov
@ 2020-11-04 20:28             ` Jens Axboe
  2020-11-04 20:50               ` Jens Axboe
  2020-11-04 21:20               ` Pavel Begunkov
  0 siblings, 2 replies; 12+ messages in thread
From: Jens Axboe @ 2020-11-04 20:28 UTC (permalink / raw)
  To: Pavel Begunkov, Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 11/4/20 1:16 PM, Pavel Begunkov wrote:
> On 04/11/2020 19:34, Jens Axboe wrote:
>> On 11/4/20 12:27 PM, Pavel Begunkov wrote:
>>> On 04/11/2020 18:32, Jens Axboe wrote:
>>>> On 11/4/20 10:50 AM, Jens Axboe wrote:
>>>>> +struct io_uring_getevents_arg {
>>>>> +	sigset_t *sigmask;
>>>>> +	struct __kernel_timespec *ts;
>>>>> +};
>>>>> +
>>>>
>>>> I missed that this is still not right, I did bring it up in your last
>>>> posting though - you can't have pointers as a user API, since the size
>>>> of the pointer will vary depending on whether this is a 32-bit or 64-bit
>>>> arch (or 32-bit app running on 64-bit kernel).
>>>
>>> Maybe it would be better 
>>>
>>> 1) to kill this extra indirection?
>>>
>>> struct io_uring_getevents_arg {
>>> -	sigset_t *sigmask;
>>> -	struct __kernel_timespec *ts;
>>> +	sigset_t sigmask;
>>> +	struct __kernel_timespec ts;
>>> };
>>>
>>> then,
>>>
>>> sigset_t *sig = (...)arg;
>>> __kernel_timespec* ts = (...)(arg + offset);
>>
>> But then it's kind of hard to know which, if any, of them are set... I
>> did think about this, and any solution seemed worse than just having the
>> extra indirection.
> 
> struct io_uring_getevents_arg {
> 	sigset_t sigmask;
> 	u32 mask;
> 	struct __kernel_timespec ts;
> };
> 
> if size > sizeof(sigmask), then use mask to determine that.
> Though, not sure how horrid the rest of the code would be.

I'm not saying it's not possible, just that I think the end result would
be worse in terms of both kernel code and how the user applications (or
liburing) would need to use it. I'd rather sacrifice an extra copy for
something that's straightforward (and logical) to use, rather than
needing weird setups or hoops to jump through. And this mask vs
sizeof(mask) thing seems pretty horrendous to me :-)

>> Yeah, not doing the extra indirection would save a copy, but don't think
>> it's worth it for this path.
> 
> I dislike branching like IORING_ENTER_GETEVENTS_TIMEOUT much more,
> from a conceptual point of view. I may try it out to see how it looks
> while it's still in for-next.

One thing I think we should change is the name,
IORING_ENTER_GETEVENTS_TIMEOUT will quickly be a bad name if we end up
adding just one more thing to the struct. Would be better to call it
IORING_ENTER_EXTRA_DATA or something, meaning that the sigmask pointer
is a pointer to the aux data instead of a sigmask. Better name
suggestions welcome...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 20:28             ` Jens Axboe
@ 2020-11-04 20:50               ` Jens Axboe
  2020-11-04 21:20               ` Pavel Begunkov
  1 sibling, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2020-11-04 20:50 UTC (permalink / raw)
  To: Pavel Begunkov, Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 11/4/20 1:28 PM, Jens Axboe wrote:
> On 11/4/20 1:16 PM, Pavel Begunkov wrote:
>> On 04/11/2020 19:34, Jens Axboe wrote:
>>> On 11/4/20 12:27 PM, Pavel Begunkov wrote:
>>>> On 04/11/2020 18:32, Jens Axboe wrote:
>>>>> On 11/4/20 10:50 AM, Jens Axboe wrote:
>>>>>> +struct io_uring_getevents_arg {
>>>>>> +	sigset_t *sigmask;
>>>>>> +	struct __kernel_timespec *ts;
>>>>>> +};
>>>>>> +
>>>>>
>>>>> I missed that this is still not right, I did bring it up in your last
>>>>> posting though - you can't have pointers as a user API, since the size
>>>>> of the pointer will vary depending on whether this is a 32-bit or 64-bit
>>>>> arch (or 32-bit app running on 64-bit kernel).
>>>>
>>>> Maybe it would be better 
>>>>
>>>> 1) to kill this extra indirection?
>>>>
>>>> struct io_uring_getevents_arg {
>>>> -	sigset_t *sigmask;
>>>> -	struct __kernel_timespec *ts;
>>>> +	sigset_t sigmask;
>>>> +	struct __kernel_timespec ts;
>>>> };
>>>>
>>>> then,
>>>>
>>>> sigset_t *sig = (...)arg;
>>>> __kernel_timespec* ts = (...)(arg + offset);
>>>
>>> But then it's kind of hard to know which, if any, of them are set... I
>>> did think about this, and any solution seemed worse than just having the
>>> extra indirection.
>>
>> struct io_uring_getevents_arg {
>> 	sigset_t sigmask;
>> 	u32 mask;
>> 	struct __kernel_timespec ts;
>> };
>>
>> if size > sizeof(sigmask), then use mask to determine that.
>> Though, not sure how horrid the rest of the code would be.
> 
> I'm not saying it's not possible, just that I think the end result would
> be worse in terms of both kernel code and how the user applications (or
> liburing) would need to use it. I'd rather sacrifice an extra copy for
> something that's straightforward (and logical) to use, rather than
> needing weird setups or hoops to jump through. And this mask vs
> sizeof(mask) thing seems pretty horrendous to me :-)
> 
>>> Yeah, not doing the extra indirection would save a copy, but don't think
>>> it's worth it for this path.
>>
>> I dislike branching like IORING_ENTER_GETEVENTS_TIMEOUT much more,
>> from a conceptual point of view. I may try it out to see how it looks
>> while it's still in for-next.
> 
> One thing I think we should change is the name,
> IORING_ENTER_GETEVENTS_TIMEOUT will quickly be a bad name if we end up
> adding just one more thing to the struct. Would be better to call it
> IORING_ENTER_EXTRA_DATA or something, meaning that the sigmask pointer
> is a pointer to the aux data instead of a sigmask. Better name
> suggestions welcome...

I'd be inclined to do something like the below:

- Rename it to IORING_ENTER_SIG_IS_DATA, which I think is more
  future-proof and explains it too. Ditto for the feature flag.

- Move the argument checking and fetching under GETEVENTS. This removes a
  weird case where you'd get EINVAL if IORING_ENTER_SIG_IS_DATA is set but
  IORING_ENTER_GETEVENTS isn't. We didn't previously fail a
  non-getevents call if e.g. sigmask was set, so I don't think we should
  add this case. The only downside here is that if we fail the validation,
  we'll only submit and return the submit count. That should be fine, as
  we'd end up with another enter and return the error there.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8439cda54e21..694a87807ea1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9146,6 +9146,29 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 	finish_wait(&ctx->sqo_sq_wait, &wait);
 }
 
+static int io_get_sig_is_data(unsigned flags, const void __user *argp,
+			      struct __kernel_timespec __user **ts,
+			      const sigset_t __user **sig, size_t *sigsz)
+{
+	struct io_uring_getevents_arg arg;
+
+	/* deal with IORING_ENTER_SIG_IS_DATA */
+	if (flags & IORING_ENTER_SIG_IS_DATA) {
+		if (*sigsz != sizeof(arg))
+			return -EINVAL;
+		if (copy_from_user(&arg, argp, sizeof(arg)))
+			return -EFAULT;
+		*sig = u64_to_user_ptr(arg.sigmask);
+		*sigsz = arg.sigmask_sz;
+		*ts = u64_to_user_ptr(arg.ts);
+	} else {
+		*sig = (const sigset_t __user *) argp;
+		*ts = NULL;
+	}
+
+	return 0;
+}
+
 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		u32, min_complete, u32, flags, const void __user *, argp,
 		size_t, sigsz)
@@ -9154,32 +9177,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	long ret = -EBADF;
 	int submitted = 0;
 	struct fd f;
-	const sigset_t __user *sig;
-	struct __kernel_timespec __user *ts;
-	struct io_uring_getevents_arg arg;
 
 	io_run_task_work();
 
 	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			IORING_ENTER_SQ_WAIT | IORING_ENTER_GETEVENTS_TIMEOUT))
+			IORING_ENTER_SQ_WAIT | IORING_ENTER_SIG_IS_DATA))
 		return -EINVAL;
 
-	/* deal with IORING_ENTER_GETEVENTS_TIMEOUT */
-	if (flags & IORING_ENTER_GETEVENTS_TIMEOUT) {
-		if (!(flags & IORING_ENTER_GETEVENTS))
-			return -EINVAL;
-		if (sigsz != sizeof(arg))
-			return -EINVAL;
-		if (copy_from_user(&arg, argp, sizeof(arg)))
-			return -EFAULT;
-		sig = u64_to_user_ptr(arg.sigmask);
-		sigsz = arg.sigmask_sz;
-		ts = u64_to_user_ptr(arg.ts);
-	} else {
-		sig = (const sigset_t __user *)argp;
-		ts = NULL;
-	}
-
 	f = fdget(fd);
 	if (!f.file)
 		return -EBADF;
@@ -9223,6 +9227,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			goto out;
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {
+		const sigset_t __user *sig;
+		struct __kernel_timespec __user *ts;
+
+		ret = io_get_sig_is_data(flags, argp, &ts, &sig, &sigsz);
+		if (unlikely(ret))
+			goto out;
+
 		min_complete = min(min_complete, ctx->cq_entries);
 
 		/*
@@ -9598,7 +9609,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
 			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
-			IORING_FEAT_GETEVENTS_TIMEOUT;
+			IORING_FEAT_SIG_IS_DATA;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 37bea07c12f2..0fa095347fb6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -317,7 +317,7 @@ asmlinkage long sys_io_uring_setup(u32 entries,
 				struct io_uring_params __user *p);
 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
 				u32 min_complete, u32 flags,
-				const sigset_t __user *sig, size_t sigsz);
+				const void __user *argp, size_t sigsz);
 asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
 				void __user *arg, unsigned int nr_args);
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1a92985a9ee8..4832addccfa6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,7 +231,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_GETEVENTS	(1U << 0)
 #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
 #define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_GETEVENTS_TIMEOUT	(1U << 3)
+#define IORING_ENTER_SIG_IS_DATA	(1U << 3)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -260,7 +260,7 @@ struct io_uring_params {
 #define IORING_FEAT_FAST_POLL		(1U << 5)
 #define IORING_FEAT_POLL_32BITS 	(1U << 6)
 #define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
-#define IORING_FEAT_GETEVENTS_TIMEOUT	(1U << 8)
+#define IORING_FEAT_SIG_IS_DATA		(1U << 8)
 
 /*
  * io_uring_register(2) opcodes and arguments

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 20:28             ` Jens Axboe
  2020-11-04 20:50               ` Jens Axboe
@ 2020-11-04 21:20               ` Pavel Begunkov
  2020-11-04 21:27                 ` Jens Axboe
  1 sibling, 1 reply; 12+ messages in thread
From: Pavel Begunkov @ 2020-11-04 21:20 UTC (permalink / raw)
  To: Jens Axboe, Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 04/11/2020 20:28, Jens Axboe wrote:
> On 11/4/20 1:16 PM, Pavel Begunkov wrote:
>> On 04/11/2020 19:34, Jens Axboe wrote:
>>> On 11/4/20 12:27 PM, Pavel Begunkov wrote:
>>>> On 04/11/2020 18:32, Jens Axboe wrote:
>>>>> On 11/4/20 10:50 AM, Jens Axboe wrote:
>>>>>> +struct io_uring_getevents_arg {
>>>>>> +	sigset_t *sigmask;
>>>>>> +	struct __kernel_timespec *ts;
>>>>>> +};
>>>>>> +
>>>>>
>>>>> I missed that this is still not right, I did bring it up in your last
>>>>> posting though - you can't have pointers as a user API, since the size
>>>>> of the pointer will vary depending on whether this is a 32-bit or 64-bit
>>>>> arch (or 32-bit app running on 64-bit kernel).
>>>>
>>>> Maybe it would be better 
>>>>
>>>> 1) to kill this extra indirection?
>>>>
>>>> struct io_uring_getevents_arg {
>>>> -	sigset_t *sigmask;
>>>> -	struct __kernel_timespec *ts;
>>>> +	sigset_t sigmask;
>>>> +	struct __kernel_timespec ts;
>>>> };
>>>>
>>>> then,
>>>>
>>>> sigset_t *sig = (...)arg;
>>>> __kernel_timespec* ts = (...)(arg + offset);
>>>
>>> But then it's kind of hard to know which, if any, of them are set... I
>>> did think about this, and any solution seemed worse than just having the
>>> extra indirection.
>>
>> struct io_uring_getevents_arg {
>> 	sigset_t sigmask;
>> 	u32 mask;
>> 	struct __kernel_timespec ts;
>> };
>>
>> if size > sizeof(sigmask), then use mask to determine that.
>> Though, not sure how horrid the rest of the code would be.
> 
> I'm not saying it's not possible, just that I think the end result would
> be worse in terms of both kernel code and how the user applications (or
> liburing) would need to use it. I'd rather sacrifice an extra copy for
> something that's straightforward (and logical) to use, rather than
> needing weird setups or hoops to jump through. And this mask vs
> sizeof(mask) thing seems pretty horrendous to me :-)

If you think so, I'll spare my time then :)

> 
>>> Yeah, not doing the extra indirection would save a copy, but don't think
>>> it's worth it for this path.
>>
>> I dislike branching like IORING_ENTER_GETEVENTS_TIMEOUT much more,
>> from a conceptual point of view. I may try it out to see how it looks
>> while it's still in for-next.
> 
> One thing I think we should change is the name,
> IORING_ENTER_GETEVENTS_TIMEOUT will quickly be a bad name if we end up
> adding just one more thing to the struct. Would be better to call it
> IORING_ENTER_EXTRA_DATA or something, meaning that the sigmask pointer
> is a pointer to the aux data instead of a sigmask. Better name
> suggestions welcome...

_EXT_ARG from extended

Also, a minor one -- s/sigsz/argsz/

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3 RESEND] io_uring: add timeout support for io_uring_enter()
  2020-11-04 21:20               ` Pavel Begunkov
@ 2020-11-04 21:27                 ` Jens Axboe
  0 siblings, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2020-11-04 21:27 UTC (permalink / raw)
  To: Pavel Begunkov, Hao Xu; +Cc: io-uring, metze, Jiufei Xue, Joseph Qi

On 11/4/20 2:20 PM, Pavel Begunkov wrote:
> On 04/11/2020 20:28, Jens Axboe wrote:
>> On 11/4/20 1:16 PM, Pavel Begunkov wrote:
>>> On 04/11/2020 19:34, Jens Axboe wrote:
>>>> On 11/4/20 12:27 PM, Pavel Begunkov wrote:
>>>>> On 04/11/2020 18:32, Jens Axboe wrote:
>>>>>> On 11/4/20 10:50 AM, Jens Axboe wrote:
>>>>>>> +struct io_uring_getevents_arg {
>>>>>>> +	sigset_t *sigmask;
>>>>>>> +	struct __kernel_timespec *ts;
>>>>>>> +};
>>>>>>> +
>>>>>>
>>>>>> I missed that this is still not right, I did bring it up in your last
>>>>>> posting though - you can't have pointers as a user API, since the size
>>>>>> of the pointer will vary depending on whether this is a 32-bit or 64-bit
>>>>>> arch (or 32-bit app running on 64-bit kernel).
>>>>>
>>>>> Maybe it would be better 
>>>>>
>>>>> 1) to kill this extra indirection?
>>>>>
>>>>> struct io_uring_getevents_arg {
>>>>> -	sigset_t *sigmask;
>>>>> -	struct __kernel_timespec *ts;
>>>>> +	sigset_t sigmask;
>>>>> +	struct __kernel_timespec ts;
>>>>> };
>>>>>
>>>>> then,
>>>>>
>>>>> sigset_t *sig = (...)arg;
>>>>> __kernel_timespec* ts = (...)(arg + offset);
>>>>
>>>> But then it's kind of hard to know which, if any, of them are set... I
>>>> did think about this, and any solution seemed worse than just having the
>>>> extra indirection.
>>>
>>> struct io_uring_getevents_arg {
>>> 	sigset_t sigmask;
>>> 	u32 mask;
>>> 	struct __kernel_timespec ts;
>>> };
>>>
>>> if size > sizeof(sigmask), then use mask to determine that.
>>> Though, not sure how horrid the rest of the code would be.
>>
>> I'm not saying it's not possible, just that I think the end result would
>> be worse in terms of both kernel code and how the user applications (or
>> liburing) would need to use it. I'd rather sacrifice an extra copy for
>> something that's straightforward (and logical) to use, rather than
>> needing weird setups or hoops to jump through. And this mask vs
>> sizeof(mask) thing seems pretty horrendous to me :-)
> 
> If you think so, I'll spare my time then :)
> 
>>
>>>> Yeah, not doing the extra indirection would save a copy, but don't think
>>>> it's worth it for this path.
>>>
>>> I dislike branching like IORING_ENTER_GETEVENTS_TIMEOUT much more,
>>> from a conceptual point of view. I may try it out to see how it looks
>>> while it's still in for-next.
>>
>> One thing I think we should change is the name,
>> IORING_ENTER_GETEVENTS_TIMEOUT will quickly be a bad name if we end up
>> adding just one more thing to the struct. Would be better to call it
>> IORING_ENTER_EXTRA_DATA or something, meaning that the sigmask pointer
>> is a pointer to the aux data instead of a sigmask. Better name
>> suggestions welcome...
> 
> _EXT_ARG from extended

Yeah I like that, I'll update it

> Also, a minor one -- s/sigsz/argsz/

Yes, might as well.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2020-11-04 21:27 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-11-02  8:50 [PATCH v3] io_uring: add timeout support for io_uring_enter() Hao Xu
2020-11-03  2:54 ` [PATCH v3 RESEND] " Hao Xu
2020-11-04 17:50   ` Jens Axboe
2020-11-04 18:32     ` Jens Axboe
2020-11-04 19:06       ` Jens Axboe
2020-11-04 19:27       ` Pavel Begunkov
2020-11-04 19:34         ` Jens Axboe
2020-11-04 20:16           ` Pavel Begunkov
2020-11-04 20:28             ` Jens Axboe
2020-11-04 20:50               ` Jens Axboe
2020-11-04 21:20               ` Pavel Begunkov
2020-11-04 21:27                 ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox