public inbox for [email protected]
 help / color / mirror / Atom feed
* [PATCH 5.12 0/3] sqpoll fixes/cleanups
@ 2021-03-10 13:13 Pavel Begunkov
  2021-03-10 13:13 ` [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle Pavel Begunkov
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Pavel Begunkov @ 2021-03-10 13:13 UTC (permalink / raw)
  To: Jens Axboe, io-uring

All can go 5.12, but 1/3 is must have, 2/3 is much preferred to land
5.12. 3/3 is meh.

Pavel Begunkov (3):
  io_uring: fix invalid ctx->sq_thread_idle
  io_uring: remove indirect ctx into sqo injection
  io_uring: simplify io_sqd_update_thread_idle()

 fs/io_uring.c | 41 ++++++++---------------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

-- 
2.24.0


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle
  2021-03-10 13:13 [PATCH 5.12 0/3] sqpoll fixes/cleanups Pavel Begunkov
@ 2021-03-10 13:13 ` Pavel Begunkov
  2021-03-10 13:56   ` Stefan Metzmacher
  2021-03-10 13:13 ` [PATCH 2/3] io_uring: remove indirect ctx into sqo injection Pavel Begunkov
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Pavel Begunkov @ 2021-03-10 13:13 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We have to set ctx->sq_thread_idle before adding a ring to an SQ task,
otherwise sqd races for seeing zero and accounting it as such.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 896a7845447c..0b39c3818809 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7827,14 +7827,14 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 
 		ctx->sq_creds = get_current_cred();
 		ctx->sq_data = sqd;
-		io_sq_thread_park(sqd);
-		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
-		io_sq_thread_unpark(sqd);
-
 		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
 		if (!ctx->sq_thread_idle)
 			ctx->sq_thread_idle = HZ;
 
+		io_sq_thread_park(sqd);
+		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
+		io_sq_thread_unpark(sqd);
+
 		if (sqd->thread)
 			return 0;
 
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/3] io_uring: remove indirect ctx into sqo injection
  2021-03-10 13:13 [PATCH 5.12 0/3] sqpoll fixes/cleanups Pavel Begunkov
  2021-03-10 13:13 ` [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle Pavel Begunkov
@ 2021-03-10 13:13 ` Pavel Begunkov
  2021-03-10 13:13 ` [PATCH 3/3] io_uring: simplify io_sqd_update_thread_idle() Pavel Begunkov
  2021-03-10 14:38 ` [PATCH 5.12 0/3] sqpoll fixes/cleanups Jens Axboe
  3 siblings, 0 replies; 17+ messages in thread
From: Pavel Begunkov @ 2021-03-10 13:13 UTC (permalink / raw)
  To: Jens Axboe, io-uring

We use ->ctx_new_list to notify sqo about new ctx pending, then sqo
should stop and splice it to its sqd->ctx_list, paired with
->sq_thread_comp.

The last one is broken because nobody reinitialises it, and trying to
fix it would only add more complexity and bugs. And the first isn't
really needed as is done under park(), that protects from races well.
Add ctx into sqd->ctx_list directly (under park()), it's much simpler
and allows to kill both, ctx_new_list and sq_thread_comp.

note: apparently there is no real problem at the moment, because
sq_thread_comp is used only by io_sq_thread_finish() followed by
parking, where list_del(&ctx->sqd_list) removes it well regardless
whether it's in the new or the active list.

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0b39c3818809..d2a26faa3bda 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -262,7 +262,6 @@ struct io_sq_data {
 
 	/* ctx's that are using this sqd */
 	struct list_head	ctx_list;
-	struct list_head	ctx_new_list;
 
 	struct task_struct	*thread;
 	struct wait_queue_head	wait;
@@ -398,7 +397,6 @@ struct io_ring_ctx {
 	struct user_struct	*user;
 
 	struct completion	ref_comp;
-	struct completion	sq_thread_comp;
 
 #if defined(CONFIG_UNIX)
 	struct socket		*ring_sock;
@@ -1136,7 +1134,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->cq_wait);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	init_completion(&ctx->ref_comp);
-	init_completion(&ctx->sq_thread_comp);
 	idr_init(&ctx->io_buffer_idr);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
@@ -6632,19 +6629,6 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 	sqd->sq_thread_idle = sq_thread_idle;
 }
 
-static void io_sqd_init_new(struct io_sq_data *sqd)
-{
-	struct io_ring_ctx *ctx;
-
-	while (!list_empty(&sqd->ctx_new_list)) {
-		ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
-		list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
-		complete(&ctx->sq_thread_comp);
-	}
-
-	io_sqd_update_thread_idle(sqd);
-}
-
 static int io_sq_thread(void *data)
 {
 	struct io_sq_data *sqd = data;
@@ -6675,11 +6659,8 @@ static int io_sq_thread(void *data)
 			up_read(&sqd->rw_lock);
 			cond_resched();
 			down_read(&sqd->rw_lock);
-			continue;
-		}
-		if (unlikely(!list_empty(&sqd->ctx_new_list))) {
-			io_sqd_init_new(sqd);
 			timeout = jiffies + sqd->sq_thread_idle;
+			continue;
 		}
 		if (fatal_signal_pending(current)) {
 			set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
@@ -7097,9 +7078,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
 
 	if (sqd) {
 		complete(&sqd->startup);
-		if (sqd->thread)
-			wait_for_completion(&ctx->sq_thread_comp);
-
 		io_sq_thread_park(sqd);
 		list_del(&ctx->sqd_list);
 		io_sqd_update_thread_idle(sqd);
@@ -7151,7 +7129,6 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
 
 	refcount_set(&sqd->refs, 1);
 	INIT_LIST_HEAD(&sqd->ctx_list);
-	INIT_LIST_HEAD(&sqd->ctx_new_list);
 	init_rwsem(&sqd->rw_lock);
 	init_waitqueue_head(&sqd->wait);
 	init_completion(&sqd->startup);
@@ -7832,7 +7809,8 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 			ctx->sq_thread_idle = HZ;
 
 		io_sq_thread_park(sqd);
-		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
+		list_add(&ctx->sqd_list, &sqd->ctx_list);
+		io_sqd_update_thread_idle(sqd);
 		io_sq_thread_unpark(sqd);
 
 		if (sqd->thread)
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/3] io_uring: simplify io_sqd_update_thread_idle()
  2021-03-10 13:13 [PATCH 5.12 0/3] sqpoll fixes/cleanups Pavel Begunkov
  2021-03-10 13:13 ` [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle Pavel Begunkov
  2021-03-10 13:13 ` [PATCH 2/3] io_uring: remove indirect ctx into sqo injection Pavel Begunkov
@ 2021-03-10 13:13 ` Pavel Begunkov
  2021-03-10 14:38 ` [PATCH 5.12 0/3] sqpoll fixes/cleanups Jens Axboe
  3 siblings, 0 replies; 17+ messages in thread
From: Pavel Begunkov @ 2021-03-10 13:13 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Use a more comprehensible() max instead of hand coding it with ifs in
io_sqd_update_thread_idle().

Signed-off-by: Pavel Begunkov <[email protected]>
---
 fs/io_uring.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2a26faa3bda..42b2ba8e0f55 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6621,11 +6621,8 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 	struct io_ring_ctx *ctx;
 	unsigned sq_thread_idle = 0;
 
-	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-		if (sq_thread_idle < ctx->sq_thread_idle)
-			sq_thread_idle = ctx->sq_thread_idle;
-	}
-
+	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+		sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
 	sqd->sq_thread_idle = sq_thread_idle;
 }
 
-- 
2.24.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle
  2021-03-10 13:13 ` [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle Pavel Begunkov
@ 2021-03-10 13:56   ` Stefan Metzmacher
  2021-03-11 10:49     ` Stefan Metzmacher
  2021-03-11 11:18     ` Pavel Begunkov
  0 siblings, 2 replies; 17+ messages in thread
From: Stefan Metzmacher @ 2021-03-10 13:56 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring


Hi Pavel,

> We have to set ctx->sq_thread_idle before adding a ring to an SQ task,
> otherwise sqd races for seeing zero and accounting it as such.
> 
> Signed-off-by: Pavel Begunkov <[email protected]>
> ---
>  fs/io_uring.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 896a7845447c..0b39c3818809 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -7827,14 +7827,14 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
>  
>  		ctx->sq_creds = get_current_cred();
>  		ctx->sq_data = sqd;
> -		io_sq_thread_park(sqd);
> -		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
> -		io_sq_thread_unpark(sqd);
> -
>  		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
>  		if (!ctx->sq_thread_idle)
>  			ctx->sq_thread_idle = HZ;
>  
> +		io_sq_thread_park(sqd);
> +		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
> +		io_sq_thread_unpark(sqd);

I wondered about the exact same change this morning, while researching
the IORING_SETUP_ATTACH_WQ behavior :-)

It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
As you introduced that flag, can you summaries it's behavior (and changes)
over time (over the releases).

I'm wondering if ctx->sq_creds is really the only thing we need to take care of.

Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
As mm, files and other things may differ now between sqe producer and the sq_thread.

metze

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 5.12 0/3] sqpoll fixes/cleanups
  2021-03-10 13:13 [PATCH 5.12 0/3] sqpoll fixes/cleanups Pavel Begunkov
                   ` (2 preceding siblings ...)
  2021-03-10 13:13 ` [PATCH 3/3] io_uring: simplify io_sqd_update_thread_idle() Pavel Begunkov
@ 2021-03-10 14:38 ` Jens Axboe
  3 siblings, 0 replies; 17+ messages in thread
From: Jens Axboe @ 2021-03-10 14:38 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 3/10/21 6:13 AM, Pavel Begunkov wrote:
> All can go 5.12, but 1/3 is must have, 2/3 is much preferred to land
> 5.12. 3/3 is meh.
> 
> Pavel Begunkov (3):
>   io_uring: fix invalid ctx->sq_thread_idle
>   io_uring: remove indirect ctx into sqo injection
>   io_uring: simplify io_sqd_update_thread_idle()
> 
>  fs/io_uring.c | 41 ++++++++---------------------------------
>  1 file changed, 8 insertions(+), 33 deletions(-)

Thanks, added 1+2 for 5.12, and 3 for 5.13.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle
  2021-03-10 13:56   ` Stefan Metzmacher
@ 2021-03-11 10:49     ` Stefan Metzmacher
  2021-03-11 11:18     ` Pavel Begunkov
  1 sibling, 0 replies; 17+ messages in thread
From: Stefan Metzmacher @ 2021-03-11 10:49 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring

> I wondered about the exact same change this morning, while researching
> the IORING_SETUP_ATTACH_WQ behavior :-)
> 
> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
> As you introduced that flag, can you summaries it's behavior (and changes)
> over time (over the releases).
> 
> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.
> 
> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
> As mm, files and other things may differ now between sqe producer and the sq_thread.

I'll start a new thread for this...

metze


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle
  2021-03-10 13:56   ` Stefan Metzmacher
  2021-03-11 10:49     ` Stefan Metzmacher
@ 2021-03-11 11:18     ` Pavel Begunkov
  2021-03-11 11:46       ` IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle) Stefan Metzmacher
  1 sibling, 1 reply; 17+ messages in thread
From: Pavel Begunkov @ 2021-03-11 11:18 UTC (permalink / raw)
  To: Stefan Metzmacher, Jens Axboe, io-uring

On 10/03/2021 13:56, Stefan Metzmacher wrote:
> 
> Hi Pavel,
> 
> I wondered about the exact same change this morning, while researching
> the IORING_SETUP_ATTACH_WQ behavior :-)
> 
> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
> As you introduced that flag, can you summaries it's behavior (and changes)
> over time (over the releases).

Not sure I remember the story in details, but from the beginning it was
for io-wq sharing only, then it had expanded to SQPOLL as well. Now it's
only about SQPOLL sharing, because of the recent io-wq changes that made
it per-task and shared by default.

In all cases it should be checking the passed in file, that should retain
the old behaviour of failing setup if the flag is set but wq_fd is not valid.

> 
> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.

io-wq is not affected by IORING_SETUP_ATTACH_WQ. It's per-task and mimics
all the resources of the creator (on the moment of io-wq creation). Off
ATTACH_WQ topic, but that's almost matches what it has been before, and
with dropped unshare bit, should be totally same.

Regarding SQPOLL, it was always using resources of the first task, so
those are just reaped of from it, and not only some particular like
mm/files but all of them, like fork does, so should be safer.

Creds are just a special case because of that personality stuff, at least
if we add back iowq unshare handling.

> 
> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?

Have no clue.

> As mm, files and other things may differ now between sqe producer and the sq_thread.

It was always using mm/files of the ctx creator's task, aka ctx->sqo_task,
but right, for the sharing case those may be different b/w ctx, so looks
like a regression to me

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 17+ messages in thread

* IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 11:18     ` Pavel Begunkov
@ 2021-03-11 11:46       ` Stefan Metzmacher
  2021-03-11 12:02         ` Stefan Metzmacher
                           ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Stefan Metzmacher @ 2021-03-11 11:46 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring


Am 11.03.21 um 12:18 schrieb Pavel Begunkov:
> On 10/03/2021 13:56, Stefan Metzmacher wrote:
>>
>> Hi Pavel,
>>
>> I wondered about the exact same change this morning, while researching
>> the IORING_SETUP_ATTACH_WQ behavior :-)
>>
>> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
>> As you introduced that flag, can you summaries it's behavior (and changes)
>> over time (over the releases).
> 
> Not sure I remember the story in details, but from the beginning it was
> for io-wq sharing only, then it had expanded to SQPOLL as well. Now it's
> only about SQPOLL sharing, because of the recent io-wq changes that made
> it per-task and shared by default.
> 
> In all cases it should be checking the passed in file, that should retain
> the old behaviour of failing setup if the flag is set but wq_fd is not valid.

Thanks, that's what I also found so far, see below for more findings.

>>
>> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.
> 
> io-wq is not affected by IORING_SETUP_ATTACH_WQ. It's per-task and mimics
> all the resources of the creator (on the moment of io-wq creation). Off
> ATTACH_WQ topic, but that's almost matches what it has been before, and
> with dropped unshare bit, should be totally same.
> 
> Regarding SQPOLL, it was always using resources of the first task, so
> those are just reaped of from it, and not only some particular like
> mm/files but all of them, like fork does, so should be safer.
> 
> Creds are just a special case because of that personality stuff, at least
> if we add back iowq unshare handling.
> 
>>
>> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
> 
> Have no clue.
> 
>> As mm, files and other things may differ now between sqe producer and the sq_thread.
> 
> It was always using mm/files of the ctx creator's task, aka ctx->sqo_task,
> but right, for the sharing case those may be different b/w ctx, so looks
> like a regression to me

Good. I'll try to explore a possible way out below.

Ok, I'm continuing the thread here (just pasting the mail I already started to write :-)

I did some more research regarding IORING_SETUP_ATTACH_WQ in 5.12.

The current logic in io_sq_offload_create() is this:

+       /* Retain compatibility with failing for an invalid attach attempt */
+       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
+                               IORING_SETUP_ATTACH_WQ) {
+               struct fd f;
+
+               f = fdget(p->wq_fd);
+               if (!f.file)
+                       return -ENXIO;
+               if (f.file->f_op != &io_uring_fops) {
+                       fdput(f);
+                       return -EINVAL;
+               }
+               fdput(f);
+       }

That means that IORING_SETUP_ATTACH_WQ (without IORING_SETUP_SQPOLL) is completely
ignored (except that we still simulate the -ENXIO and -EINVAL  cases), correct?
(You already agreed on that above :-)

The reason for this is that io_wq is no longer maintained per io_ring_ctx,
but instead it is now global per io_uring_task.
Which means each userspace thread (or the sq_thread) has its own io_uring_task and
thus its own io_wq.

Regarding the IORING_SETUP_SQPOLL|IORING_SETUP_ATTACH_WQ case we still allow attaching
to the sq_thread of a different io_ring_ctx. The sq_thread runs in the context of
the io_uring_setup() syscall that created it. We used to switch current->mm, current->files
and other things before calling __io_sq_thread() before, but we no longer do that.
And this seems to be security problem to me, as it's now possible for the attached
io_ring_ctx to start sqe's copying the whole address space of the donator into
a registered fixed file of the attached process.

As we already ignore IORING_SETUP_ATTACH_WQ without IORING_SETUP_SQPOLL, what about
ignoring it as well if the attaching task uses different ->mm, ->files, ...
So IORING_SETUP_ATTACH_WQ would only have any effect if the task calling io_uring_setup()
runs in the same context (except of the creds) as the existing sq_thread, which means it would work
if multiple userspace threads of the same userspace process want to share the sq_thread and its
io_wq. Everything else would be stupid (similar to the unshare() cases).
But as this has worked before, we just silently ignore IORING_SETUP_ATTACH_WQ is
we find a context mismatch and let io_uring_setup() silently create a new sq_thread.

What do you think?

metze

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 11:46       ` IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle) Stefan Metzmacher
@ 2021-03-11 12:02         ` Stefan Metzmacher
  2021-03-11 15:28           ` Jens Axboe
  2021-03-11 12:27         ` Pavel Begunkov
  2021-03-11 15:27         ` Jens Axboe
  2 siblings, 1 reply; 17+ messages in thread
From: Stefan Metzmacher @ 2021-03-11 12:02 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring


Am 11.03.21 um 12:46 schrieb Stefan Metzmacher:
> 
> Am 11.03.21 um 12:18 schrieb Pavel Begunkov:
>> On 10/03/2021 13:56, Stefan Metzmacher wrote:
>>>
>>> Hi Pavel,
>>>
>>> I wondered about the exact same change this morning, while researching
>>> the IORING_SETUP_ATTACH_WQ behavior :-)
>>>
>>> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
>>> As you introduced that flag, can you summaries it's behavior (and changes)
>>> over time (over the releases).
>>
>> Not sure I remember the story in details, but from the beginning it was
>> for io-wq sharing only, then it had expanded to SQPOLL as well. Now it's
>> only about SQPOLL sharing, because of the recent io-wq changes that made
>> it per-task and shared by default.
>>
>> In all cases it should be checking the passed in file, that should retain
>> the old behaviour of failing setup if the flag is set but wq_fd is not valid.
> 
> Thanks, that's what I also found so far, see below for more findings.
> 
>>>
>>> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.
>>
>> io-wq is not affected by IORING_SETUP_ATTACH_WQ. It's per-task and mimics
>> all the resources of the creator (on the moment of io-wq creation). Off
>> ATTACH_WQ topic, but that's almost matches what it has been before, and
>> with dropped unshare bit, should be totally same.
>>
>> Regarding SQPOLL, it was always using resources of the first task, so
>> those are just reaped of from it, and not only some particular like
>> mm/files but all of them, like fork does, so should be safer.
>>
>> Creds are just a special case because of that personality stuff, at least
>> if we add back iowq unshare handling.
>>
>>>
>>> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
>>
>> Have no clue.
>>
>>> As mm, files and other things may differ now between sqe producer and the sq_thread.
>>
>> It was always using mm/files of the ctx creator's task, aka ctx->sqo_task,
>> but right, for the sharing case those may be different b/w ctx, so looks
>> like a regression to me
> 
> Good. I'll try to explore a possible way out below.
> 
> Ok, I'm continuing the thread here (just pasting the mail I already started to write :-)
> 
> I did some more research regarding IORING_SETUP_ATTACH_WQ in 5.12.
> 
> The current logic in io_sq_offload_create() is this:
> 
> +       /* Retain compatibility with failing for an invalid attach attempt */
> +       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
> +                               IORING_SETUP_ATTACH_WQ) {
> +               struct fd f;
> +
> +               f = fdget(p->wq_fd);
> +               if (!f.file)
> +                       return -ENXIO;
> +               if (f.file->f_op != &io_uring_fops) {
> +                       fdput(f);
> +                       return -EINVAL;
> +               }
> +               fdput(f);
> +       }
> 
> That means that IORING_SETUP_ATTACH_WQ (without IORING_SETUP_SQPOLL) is completely
> ignored (except that we still simulate the -ENXIO and -EINVAL  cases), correct?
> (You already agreed on that above :-)
> 
> The reason for this is that io_wq is no longer maintained per io_ring_ctx,
> but instead it is now global per io_uring_task.
> Which means each userspace thread (or the sq_thread) has its own io_uring_task and
> thus its own io_wq.
> 
> Regarding the IORING_SETUP_SQPOLL|IORING_SETUP_ATTACH_WQ case we still allow attaching
> to the sq_thread of a different io_ring_ctx. The sq_thread runs in the context of
> the io_uring_setup() syscall that created it. We used to switch current->mm, current->files
> and other things before calling __io_sq_thread() before, but we no longer do that.
> And this seems to be security problem to me, as it's now possible for the attached
> io_ring_ctx to start sqe's copying the whole address space of the donator into
> a registered fixed file of the attached process.
> 
> As we already ignore IORING_SETUP_ATTACH_WQ without IORING_SETUP_SQPOLL, what about
> ignoring it as well if the attaching task uses different ->mm, ->files, ...
> So IORING_SETUP_ATTACH_WQ would only have any effect if the task calling io_uring_setup()
> runs in the same context (except of the creds) as the existing sq_thread, which means it would work
> if multiple userspace threads of the same userspace process want to share the sq_thread and its
> io_wq. Everything else would be stupid (similar to the unshare() cases).
> But as this has worked before, we just silently ignore IORING_SETUP_ATTACH_WQ is
> we find a context mismatch and let io_uring_setup() silently create a new sq_thread.

Or we completely ignore IORING_SETUP_ATTACH_WQ (execpt the error cases).

Then we can implement a new IORING_SETUP_ATTACH_SQ with new semantics,
that the existing sq_thread will be used as it and both sides now what it means to them.
We also add a new IORING_REGISTER_RESTRICTIONS/IORING_RESTRICTION_ALLOW_SQ_ATTACHMENTS
which prepares the first io_ring_ctx to allow others to attach.

Would that make sense?

metze

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 11:46       ` IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle) Stefan Metzmacher
  2021-03-11 12:02         ` Stefan Metzmacher
@ 2021-03-11 12:27         ` Pavel Begunkov
  2021-03-11 12:44           ` Stefan Metzmacher
  2021-03-11 15:27         ` Jens Axboe
  2 siblings, 1 reply; 17+ messages in thread
From: Pavel Begunkov @ 2021-03-11 12:27 UTC (permalink / raw)
  To: Stefan Metzmacher, Jens Axboe, io-uring

On 11/03/2021 11:46, Stefan Metzmacher wrote:
> Am 11.03.21 um 12:18 schrieb Pavel Begunkov:
>> On 10/03/2021 13:56, Stefan Metzmacher wrote:
>>>
>>> Hi Pavel,
>>>
>>> I wondered about the exact same change this morning, while researching
>>> the IORING_SETUP_ATTACH_WQ behavior :-)
>>>
>>> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
>>> As you introduced that flag, can you summaries it's behavior (and changes)
>>> over time (over the releases).
>>
>> Not sure I remember the story in details, but from the beginning it was
>> for io-wq sharing only, then it had expanded to SQPOLL as well. Now it's
>> only about SQPOLL sharing, because of the recent io-wq changes that made
>> it per-task and shared by default.
>>
>> In all cases it should be checking the passed in file, that should retain
>> the old behaviour of failing setup if the flag is set but wq_fd is not valid.
> 
> Thanks, that's what I also found so far, see below for more findings.
> 
>>>
>>> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.
>>
>> io-wq is not affected by IORING_SETUP_ATTACH_WQ. It's per-task and mimics
>> all the resources of the creator (on the moment of io-wq creation). Off
>> ATTACH_WQ topic, but that's almost matches what it has been before, and
>> with dropped unshare bit, should be totally same.
>>
>> Regarding SQPOLL, it was always using resources of the first task, so
>> those are just reaped of from it, and not only some particular like
>> mm/files but all of them, like fork does, so should be safer.
>>
>> Creds are just a special case because of that personality stuff, at least
>> if we add back iowq unshare handling.
>>
>>>
>>> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
>>
>> Have no clue.
>>
>>> As mm, files and other things may differ now between sqe producer and the sq_thread.
>>
>> It was always using mm/files of the ctx creator's task, aka ctx->sqo_task,
>> but right, for the sharing case those may be different b/w ctx, so looks
>> like a regression to me
> 
> Good. I'll try to explore a possible way out below.
> 
> Ok, I'm continuing the thread here (just pasting the mail I already started to write :-)
> 
> I did some more research regarding IORING_SETUP_ATTACH_WQ in 5.12.
> 
> The current logic in io_sq_offload_create() is this:
> 
> +       /* Retain compatibility with failing for an invalid attach attempt */
> +       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
> +                               IORING_SETUP_ATTACH_WQ) {
> +               struct fd f;
> +
> +               f = fdget(p->wq_fd);
> +               if (!f.file)
> +                       return -ENXIO;
> +               if (f.file->f_op != &io_uring_fops) {
> +                       fdput(f);
> +                       return -EINVAL;
> +               }
> +               fdput(f);
> +       }
> 
> That means that IORING_SETUP_ATTACH_WQ (without IORING_SETUP_SQPOLL) is completely
> ignored (except that we still simulate the -ENXIO and -EINVAL  cases), correct?
> (You already agreed on that above :-)

Yep, and we do these -ENXIO and -EINVAL for SQPOLL as well.
 
> The reason for this is that io_wq is no longer maintained per io_ring_ctx,
> but instead it is now global per io_uring_task.
> Which means each userspace thread (or the sq_thread) has its own io_uring_task and
> thus its own io_wq.

Just for anyone out of context, it's per process/thread/struct task/etc.
struct io_uring_task is just a bit of a context attached to a task ever submitted
io_uring requests, and its' not some special kind of a task.

> Regarding the IORING_SETUP_SQPOLL|IORING_SETUP_ATTACH_WQ case we still allow attaching
> to the sq_thread of a different io_ring_ctx. The sq_thread runs in the context of
> the io_uring_setup() syscall that created it. We used to switch current->mm, current->files
> and other things before calling __io_sq_thread() before, but we no longer do that.
> And this seems to be security problem to me, as it's now possible for the attached
> io_ring_ctx to start sqe's copying the whole address space of the donator into
> a registered fixed file of the attached process.

It's not as bad, because 1) you voluntarily passes fd and 2) requires privileges,
but it's a change of behaviour, which, well, can be exploited as you said.

> As we already ignore IORING_SETUP_ATTACH_WQ without IORING_SETUP_SQPOLL, what about
> ignoring it as well if the attaching task uses different ->mm, ->files, ...
> So IORING_SETUP_ATTACH_WQ would only have any effect if the task calling io_uring_setup()
> runs in the same context (except of the creds) as the existing sq_thread, which means it would work
> if multiple userspace threads of the same userspace process want to share the sq_thread and its
> io_wq. Everything else would be stupid (similar to the unshare() cases).
> But as this has worked before, we just silently ignore IORING_SETUP_ATTACH_WQ is
> we find a context mismatch and let io_uring_setup() silently create a new sq_thread.

options:
1. Return back all that acquire_mm_files. Not great, not as safe
as new io-wq, etc.

2. Completely ignore SQPOLL sharing. Performance regressions...

3. Do selected sharing. Maybe if thread group or so matches, should
be safer than just mm/files check (or any subset of possibly long
list). And there may be differences when the creator task do
unshare/etc., but can be patched up (from where the unshare hook came
in the first place).

I like 3) but 2) may do as well. The only performance problem I see
is for those who wanted to use it out of threads. E.g. there even
was a proposal to have per-CPU SQPOLL tasks and keep them per whole
system.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 12:27         ` Pavel Begunkov
@ 2021-03-11 12:44           ` Stefan Metzmacher
  2021-03-11 15:30             ` Jens Axboe
  0 siblings, 1 reply; 17+ messages in thread
From: Stefan Metzmacher @ 2021-03-11 12:44 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring


Am 11.03.21 um 13:27 schrieb Pavel Begunkov:
> On 11/03/2021 11:46, Stefan Metzmacher wrote:
>> Am 11.03.21 um 12:18 schrieb Pavel Begunkov:
>>> On 10/03/2021 13:56, Stefan Metzmacher wrote:
>>>>
>>>> Hi Pavel,
>>>>
>>>> I wondered about the exact same change this morning, while researching
>>>> the IORING_SETUP_ATTACH_WQ behavior :-)
>>>>
>>>> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
>>>> As you introduced that flag, can you summaries it's behavior (and changes)
>>>> over time (over the releases).
>>>
>>> Not sure I remember the story in details, but from the beginning it was
>>> for io-wq sharing only, then it had expanded to SQPOLL as well. Now it's
>>> only about SQPOLL sharing, because of the recent io-wq changes that made
>>> it per-task and shared by default.
>>>
>>> In all cases it should be checking the passed in file, that should retain
>>> the old behaviour of failing setup if the flag is set but wq_fd is not valid.
>>
>> Thanks, that's what I also found so far, see below for more findings.
>>
>>>>
>>>> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.
>>>
>>> io-wq is not affected by IORING_SETUP_ATTACH_WQ. It's per-task and mimics
>>> all the resources of the creator (on the moment of io-wq creation). Off
>>> ATTACH_WQ topic, but that's almost matches what it has been before, and
>>> with dropped unshare bit, should be totally same.
>>>
>>> Regarding SQPOLL, it was always using resources of the first task, so
>>> those are just reaped of from it, and not only some particular like
>>> mm/files but all of them, like fork does, so should be safer.
>>>
>>> Creds are just a special case because of that personality stuff, at least
>>> if we add back iowq unshare handling.
>>>
>>>>
>>>> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
>>>
>>> Have no clue.
>>>
>>>> As mm, files and other things may differ now between sqe producer and the sq_thread.
>>>
>>> It was always using mm/files of the ctx creator's task, aka ctx->sqo_task,
>>> but right, for the sharing case those may be different b/w ctx, so looks
>>> like a regression to me
>>
>> Good. I'll try to explore a possible way out below.
>>
>> Ok, I'm continuing the thread here (just pasting the mail I already started to write :-)
>>
>> I did some more research regarding IORING_SETUP_ATTACH_WQ in 5.12.
>>
>> The current logic in io_sq_offload_create() is this:
>>
>> +       /* Retain compatibility with failing for an invalid attach attempt */
>> +       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
>> +                               IORING_SETUP_ATTACH_WQ) {
>> +               struct fd f;
>> +
>> +               f = fdget(p->wq_fd);
>> +               if (!f.file)
>> +                       return -ENXIO;
>> +               if (f.file->f_op != &io_uring_fops) {
>> +                       fdput(f);
>> +                       return -EINVAL;
>> +               }
>> +               fdput(f);
>> +       }
>>
>> That means that IORING_SETUP_ATTACH_WQ (without IORING_SETUP_SQPOLL) is completely
>> ignored (except that we still simulate the -ENXIO and -EINVAL  cases), correct?
>> (You already agreed on that above :-)
> 
> Yep, and we do these -ENXIO and -EINVAL for SQPOLL as well.
>  
>> The reason for this is that io_wq is no longer maintained per io_ring_ctx,
>> but instead it is now global per io_uring_task.
>> Which means each userspace thread (or the sq_thread) has its own io_uring_task and
>> thus its own io_wq.
> 
> Just for anyone out of context, it's per process/thread/struct task/etc.
> struct io_uring_task is just a bit of a context attached to a task ever submitted
> io_uring requests, and its' not some special kind of a task.
> 
>> Regarding the IORING_SETUP_SQPOLL|IORING_SETUP_ATTACH_WQ case we still allow attaching
>> to the sq_thread of a different io_ring_ctx. The sq_thread runs in the context of
>> the io_uring_setup() syscall that created it. We used to switch current->mm, current->files
>> and other things before calling __io_sq_thread() before, but we no longer do that.
>> And this seems to be security problem to me, as it's now possible for the attached
>> io_ring_ctx to start sqe's copying the whole address space of the donator into
>> a registered fixed file of the attached process.
> 
> It's not as bad, because 1) you voluntarily passes fd and 2) requires privileges,
> but it's a change of behaviour, which, well, can be exploited as you said.

Yes, but pointers and other things may have a different meaning now, as they were
against the thread that produced the sqe's and now it's relativ to the unchanged sq_thread.
So unmodified application may corrupt/leak there data.

>> As we already ignore IORING_SETUP_ATTACH_WQ without IORING_SETUP_SQPOLL, what about
>> ignoring it as well if the attaching task uses different ->mm, ->files, ...
>> So IORING_SETUP_ATTACH_WQ would only have any effect if the task calling io_uring_setup()
>> runs in the same context (except of the creds) as the existing sq_thread, which means it would work
>> if multiple userspace threads of the same userspace process want to share the sq_thread and its
>> io_wq. Everything else would be stupid (similar to the unshare() cases).
>> But as this has worked before, we just silently ignore IORING_SETUP_ATTACH_WQ is
>> we find a context mismatch and let io_uring_setup() silently create a new sq_thread.
> 
> options:
> 1. Return back all that acquire_mm_files. Not great, not as safe
> as new io-wq, etc.
> 
> 2. Completely ignore SQPOLL sharing. Performance regressions...
> 
> 3. Do selected sharing. Maybe if thread group or so matches, should
> be safer than just mm/files check (or any subset of possibly long
> list). And there may be differences when the creator task do
> unshare/etc., but can be patched up (from where the unshare hook came
> in the first place).
> 
> I like 3) but 2) may do as well. The only performance problem I see
> is for those who wanted to use it out of threads. E.g. there even
> was a proposal to have per-CPU SQPOLL tasks and keep them per whole
> system.

Yes 2. with having a new IORING_SETUP_ATTACH_SQ (see my other mail)

Or 3. and I guess the thread group might be ok.
But somehow 2 feels safer and we could start with fresh ideas from there.

metze

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 11:46       ` IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle) Stefan Metzmacher
  2021-03-11 12:02         ` Stefan Metzmacher
  2021-03-11 12:27         ` Pavel Begunkov
@ 2021-03-11 15:27         ` Jens Axboe
  2 siblings, 0 replies; 17+ messages in thread
From: Jens Axboe @ 2021-03-11 15:27 UTC (permalink / raw)
  To: Stefan Metzmacher, Pavel Begunkov, io-uring

On 3/11/21 4:46 AM, Stefan Metzmacher wrote:
> 
> Am 11.03.21 um 12:18 schrieb Pavel Begunkov:
>> On 10/03/2021 13:56, Stefan Metzmacher wrote:
>>>
>>> Hi Pavel,
>>>
>>> I wondered about the exact same change this morning, while researching
>>> the IORING_SETUP_ATTACH_WQ behavior :-)
>>>
>>> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
>>> As you introduced that flag, can you summaries it's behavior (and changes)
>>> over time (over the releases).
>>
>> Not sure I remember the story in details, but from the beginning it was
>> for io-wq sharing only, then it had expanded to SQPOLL as well. Now it's
>> only about SQPOLL sharing, because of the recent io-wq changes that made
>> it per-task and shared by default.
>>
>> In all cases it should be checking the passed in file, that should retain
>> the old behaviour of failing setup if the flag is set but wq_fd is not valid.
> 
> Thanks, that's what I also found so far, see below for more findings.
> 
>>>
>>> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.
>>
>> io-wq is not affected by IORING_SETUP_ATTACH_WQ. It's per-task and mimics
>> all the resources of the creator (on the moment of io-wq creation). Off
>> ATTACH_WQ topic, but that's almost matches what it has been before, and
>> with dropped unshare bit, should be totally same.
>>
>> Regarding SQPOLL, it was always using resources of the first task, so
>> those are just reaped of from it, and not only some particular like
>> mm/files but all of them, like fork does, so should be safer.
>>
>> Creds are just a special case because of that personality stuff, at least
>> if we add back iowq unshare handling.
>>
>>>
>>> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
>>
>> Have no clue.
>>
>>> As mm, files and other things may differ now between sqe producer and the sq_thread.
>>
>> It was always using mm/files of the ctx creator's task, aka ctx->sqo_task,
>> but right, for the sharing case those may be different b/w ctx, so looks
>> like a regression to me
> 
> Good. I'll try to explore a possible way out below.
> 
> Ok, I'm continuing the thread here (just pasting the mail I already
> started to write :-)
> 
> I did some more research regarding IORING_SETUP_ATTACH_WQ in 5.12.
> 
> The current logic in io_sq_offload_create() is this:
> 
> +       /* Retain compatibility with failing for an invalid attach attempt */
> +       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
> +                               IORING_SETUP_ATTACH_WQ) {
> +               struct fd f;
> +
> +               f = fdget(p->wq_fd);
> +               if (!f.file)
> +                       return -ENXIO;
> +               if (f.file->f_op != &io_uring_fops) {
> +                       fdput(f);
> +                       return -EINVAL;
> +               }
> +               fdput(f);
> +       }
> 
> That means that IORING_SETUP_ATTACH_WQ (without IORING_SETUP_SQPOLL)
> is completely ignored (except that we still simulate the -ENXIO and
> -EINVAL  cases), correct? (You already agreed on that above :-)

Correct, there's no need to share anything there anymore. So we just
pretend that we do, since the application is none the wiser on that
front anyway.

> The reason for this is that io_wq is no longer maintained per
> io_ring_ctx, but instead it is now global per io_uring_task. Which
> means each userspace thread (or the sq_thread) has its own
> io_uring_task and thus its own io_wq.

Right

> Regarding the IORING_SETUP_SQPOLL|IORING_SETUP_ATTACH_WQ case we still
> allow attaching to the sq_thread of a different io_ring_ctx. The
> sq_thread runs in the context of the io_uring_setup() syscall that
> created it. We used to switch current->mm, current->files and other
> things before calling __io_sq_thread() before, but we no longer do
> that. And this seems to be security problem to me, as it's now
> possible for the attached io_ring_ctx to start sqe's copying the whole
> address space of the donator into a registered fixed file of the
> attached process.

As Pavel commented, this isn't really a security issue, as you're
willingly passing the fd for someone to attach to. That's why I made it
an fd based attachment in the first place. That said, it can cause
breakage now where if you rely on your ring creator mm (etc) to be used,
then you'll have a bad time since things will fail. That's obviously not
ideal, and certainly needs fixing.

> As we already ignore IORING_SETUP_ATTACH_WQ without
> IORING_SETUP_SQPOLL, what about ignoring it as well if the attaching
> task uses different ->mm, ->files, ... So IORING_SETUP_ATTACH_WQ would
> only have any effect if the task calling io_uring_setup() runs in the
> same context (except of the creds) as the existing sq_thread, which
> means it would work if multiple userspace threads of the same
> userspace process want to share the sq_thread and its io_wq.
> Everything else would be stupid (similar to the unshare() cases). But
> as this has worked before, we just silently ignore
> IORING_SETUP_ATTACH_WQ is we find a context mismatch and let
> io_uring_setup() silently create a new sq_thread.
> 
> What do you think?

I think that's a very reasonable first approach, and would avoid a
direct regression in terms of having the thread now uses different
resources. It might very well cause a CPU regression since you now have
eg 2 threads instead of 1, but that's something we can tackle down the
line.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 12:02         ` Stefan Metzmacher
@ 2021-03-11 15:28           ` Jens Axboe
  0 siblings, 0 replies; 17+ messages in thread
From: Jens Axboe @ 2021-03-11 15:28 UTC (permalink / raw)
  To: Stefan Metzmacher, Pavel Begunkov, io-uring

On 3/11/21 5:02 AM, Stefan Metzmacher wrote:
> Or we completely ignore IORING_SETUP_ATTACH_WQ (execpt the error
> cases).
> 
> Then we can implement a new IORING_SETUP_ATTACH_SQ with new semantics,
> that the existing sq_thread will be used as it and both sides now what
> it means to them. We also add a new
> IORING_REGISTER_RESTRICTIONS/IORING_RESTRICTION_ALLOW_SQ_ATTACHMENTS
> which prepares the first io_ring_ctx to allow others to attach.
> 
> Would that make sense?

I think we should retain ATTACH_WQ semantics with SQPOLL for the cases
that are possible. That would not exclude doing a more specific
ATTACH_SQ for the new case. And maybe those two cases can then be folded
down the line.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 12:44           ` Stefan Metzmacher
@ 2021-03-11 15:30             ` Jens Axboe
  2021-03-11 15:38               ` Jens Axboe
  0 siblings, 1 reply; 17+ messages in thread
From: Jens Axboe @ 2021-03-11 15:30 UTC (permalink / raw)
  To: Stefan Metzmacher, Pavel Begunkov, io-uring

On 3/11/21 5:44 AM, Stefan Metzmacher wrote:
> 
> Am 11.03.21 um 13:27 schrieb Pavel Begunkov:
>> On 11/03/2021 11:46, Stefan Metzmacher wrote:
>>> Am 11.03.21 um 12:18 schrieb Pavel Begunkov:
>>>> On 10/03/2021 13:56, Stefan Metzmacher wrote:
>>>>>
>>>>> Hi Pavel,
>>>>>
>>>>> I wondered about the exact same change this morning, while researching
>>>>> the IORING_SETUP_ATTACH_WQ behavior :-)
>>>>>
>>>>> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
>>>>> As you introduced that flag, can you summaries it's behavior (and changes)
>>>>> over time (over the releases).
>>>>
>>>> Not sure I remember the story in details, but from the beginning it was
>>>> for io-wq sharing only, then it had expanded to SQPOLL as well. Now it's
>>>> only about SQPOLL sharing, because of the recent io-wq changes that made
>>>> it per-task and shared by default.
>>>>
>>>> In all cases it should be checking the passed in file, that should retain
>>>> the old behaviour of failing setup if the flag is set but wq_fd is not valid.
>>>
>>> Thanks, that's what I also found so far, see below for more findings.
>>>
>>>>>
>>>>> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.
>>>>
>>>> io-wq is not affected by IORING_SETUP_ATTACH_WQ. It's per-task and mimics
>>>> all the resources of the creator (on the moment of io-wq creation). Off
>>>> ATTACH_WQ topic, but that's almost matches what it has been before, and
>>>> with dropped unshare bit, should be totally same.
>>>>
>>>> Regarding SQPOLL, it was always using resources of the first task, so
>>>> those are just reaped of from it, and not only some particular like
>>>> mm/files but all of them, like fork does, so should be safer.
>>>>
>>>> Creds are just a special case because of that personality stuff, at least
>>>> if we add back iowq unshare handling.
>>>>
>>>>>
>>>>> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
>>>>
>>>> Have no clue.
>>>>
>>>>> As mm, files and other things may differ now between sqe producer and the sq_thread.
>>>>
>>>> It was always using mm/files of the ctx creator's task, aka ctx->sqo_task,
>>>> but right, for the sharing case those may be different b/w ctx, so looks
>>>> like a regression to me
>>>
>>> Good. I'll try to explore a possible way out below.
>>>
>>> Ok, I'm continuing the thread here (just pasting the mail I already started to write :-)
>>>
>>> I did some more research regarding IORING_SETUP_ATTACH_WQ in 5.12.
>>>
>>> The current logic in io_sq_offload_create() is this:
>>>
>>> +       /* Retain compatibility with failing for an invalid attach attempt */
>>> +       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
>>> +                               IORING_SETUP_ATTACH_WQ) {
>>> +               struct fd f;
>>> +
>>> +               f = fdget(p->wq_fd);
>>> +               if (!f.file)
>>> +                       return -ENXIO;
>>> +               if (f.file->f_op != &io_uring_fops) {
>>> +                       fdput(f);
>>> +                       return -EINVAL;
>>> +               }
>>> +               fdput(f);
>>> +       }
>>>
>>> That means that IORING_SETUP_ATTACH_WQ (without IORING_SETUP_SQPOLL) is completely
>>> ignored (except that we still simulate the -ENXIO and -EINVAL  cases), correct?
>>> (You already agreed on that above :-)
>>
>> Yep, and we do these -ENXIO and -EINVAL for SQPOLL as well.
>>  
>>> The reason for this is that io_wq is no longer maintained per io_ring_ctx,
>>> but instead it is now global per io_uring_task.
>>> Which means each userspace thread (or the sq_thread) has its own io_uring_task and
>>> thus its own io_wq.
>>
>> Just for anyone out of context, it's per process/thread/struct task/etc.
>> struct io_uring_task is just a bit of a context attached to a task ever submitted
>> io_uring requests, and its' not some special kind of a task.
>>
>>> Regarding the IORING_SETUP_SQPOLL|IORING_SETUP_ATTACH_WQ case we still allow attaching
>>> to the sq_thread of a different io_ring_ctx. The sq_thread runs in the context of
>>> the io_uring_setup() syscall that created it. We used to switch current->mm, current->files
>>> and other things before calling __io_sq_thread() before, but we no longer do that.
>>> And this seems to be security problem to me, as it's now possible for the attached
>>> io_ring_ctx to start sqe's copying the whole address space of the donator into
>>> a registered fixed file of the attached process.
>>
>> It's not as bad, because 1) you voluntarily passes fd and 2) requires privileges,
>> but it's a change of behaviour, which, well, can be exploited as you said.
> 
> Yes, but pointers and other things may have a different meaning now, as they were
> against the thread that produced the sqe's and now it's relativ to the unchanged sq_thread.
> So unmodified application may corrupt/leak there data.
> 
>>> As we already ignore IORING_SETUP_ATTACH_WQ without IORING_SETUP_SQPOLL, what about
>>> ignoring it as well if the attaching task uses different ->mm, ->files, ...
>>> So IORING_SETUP_ATTACH_WQ would only have any effect if the task calling io_uring_setup()
>>> runs in the same context (except of the creds) as the existing sq_thread, which means it would work
>>> if multiple userspace threads of the same userspace process want to share the sq_thread and its
>>> io_wq. Everything else would be stupid (similar to the unshare() cases).
>>> But as this has worked before, we just silently ignore IORING_SETUP_ATTACH_WQ is
>>> we find a context mismatch and let io_uring_setup() silently create a new sq_thread.
>>
>> options:
>> 1. Return back all that acquire_mm_files. Not great, not as safe
>> as new io-wq, etc.
>>
>> 2. Completely ignore SQPOLL sharing. Performance regressions...
>>
>> 3. Do selected sharing. Maybe if thread group or so matches, should
>> be safer than just mm/files check (or any subset of possibly long
>> list). And there may be differences when the creator task do
>> unshare/etc., but can be patched up (from where the unshare hook came
>> in the first place).
>>
>> I like 3) but 2) may do as well. The only performance problem I see
>> is for those who wanted to use it out of threads. E.g. there even
>> was a proposal to have per-CPU SQPOLL tasks and keep them per whole
>> system.
> 
> Yes 2. with having a new IORING_SETUP_ATTACH_SQ (see my other mail)
> 
> Or 3. and I guess the thread group might be ok.
> But somehow 2 feels safer and we could start with fresh ideas from there.

3 should be perfectly safe, outside of someone doing unshare(). And I
think we already agreed that this case is in the realm of "garbage in,
garbage out" and we don't need to specifically cater to it. If we match
at attach time, we should be good to go.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 15:30             ` Jens Axboe
@ 2021-03-11 15:38               ` Jens Axboe
  2021-03-11 15:54                 ` Stefan Metzmacher
  0 siblings, 1 reply; 17+ messages in thread
From: Jens Axboe @ 2021-03-11 15:38 UTC (permalink / raw)
  To: Stefan Metzmacher, Pavel Begunkov, io-uring

On 3/11/21 8:30 AM, Jens Axboe wrote:
> On 3/11/21 5:44 AM, Stefan Metzmacher wrote:
>>
>> Am 11.03.21 um 13:27 schrieb Pavel Begunkov:
>>> On 11/03/2021 11:46, Stefan Metzmacher wrote:
>>>> Am 11.03.21 um 12:18 schrieb Pavel Begunkov:
>>>>> On 10/03/2021 13:56, Stefan Metzmacher wrote:
>>>>>>
>>>>>> Hi Pavel,
>>>>>>
>>>>>> I wondered about the exact same change this morning, while researching
>>>>>> the IORING_SETUP_ATTACH_WQ behavior :-)
>>>>>>
>>>>>> It still seems to me that IORING_SETUP_ATTACH_WQ changed over time.
>>>>>> As you introduced that flag, can you summaries it's behavior (and changes)
>>>>>> over time (over the releases).
>>>>>
>>>>> Not sure I remember the story in details, but from the beginning it was
>>>>> for io-wq sharing only, then it had expanded to SQPOLL as well. Now it's
>>>>> only about SQPOLL sharing, because of the recent io-wq changes that made
>>>>> it per-task and shared by default.
>>>>>
>>>>> In all cases it should be checking the passed in file, that should retain
>>>>> the old behaviour of failing setup if the flag is set but wq_fd is not valid.
>>>>
>>>> Thanks, that's what I also found so far, see below for more findings.
>>>>
>>>>>>
>>>>>> I'm wondering if ctx->sq_creds is really the only thing we need to take care of.
>>>>>
>>>>> io-wq is not affected by IORING_SETUP_ATTACH_WQ. It's per-task and mimics
>>>>> all the resources of the creator (on the moment of io-wq creation). Off
>>>>> ATTACH_WQ topic, but that's almost matches what it has been before, and
>>>>> with dropped unshare bit, should be totally same.
>>>>>
>>>>> Regarding SQPOLL, it was always using resources of the first task, so
>>>>> those are just reaped of from it, and not only some particular like
>>>>> mm/files but all of them, like fork does, so should be safer.
>>>>>
>>>>> Creds are just a special case because of that personality stuff, at least
>>>>> if we add back iowq unshare handling.
>>>>>
>>>>>>
>>>>>> Do we know about existing users of IORING_SETUP_ATTACH_WQ and their use case?
>>>>>
>>>>> Have no clue.
>>>>>
>>>>>> As mm, files and other things may differ now between sqe producer and the sq_thread.
>>>>>
>>>>> It was always using mm/files of the ctx creator's task, aka ctx->sqo_task,
>>>>> but right, for the sharing case those may be different b/w ctx, so looks
>>>>> like a regression to me
>>>>
>>>> Good. I'll try to explore a possible way out below.
>>>>
>>>> Ok, I'm continuing the thread here (just pasting the mail I already started to write :-)
>>>>
>>>> I did some more research regarding IORING_SETUP_ATTACH_WQ in 5.12.
>>>>
>>>> The current logic in io_sq_offload_create() is this:
>>>>
>>>> +       /* Retain compatibility with failing for an invalid attach attempt */
>>>> +       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
>>>> +                               IORING_SETUP_ATTACH_WQ) {
>>>> +               struct fd f;
>>>> +
>>>> +               f = fdget(p->wq_fd);
>>>> +               if (!f.file)
>>>> +                       return -ENXIO;
>>>> +               if (f.file->f_op != &io_uring_fops) {
>>>> +                       fdput(f);
>>>> +                       return -EINVAL;
>>>> +               }
>>>> +               fdput(f);
>>>> +       }
>>>>
>>>> That means that IORING_SETUP_ATTACH_WQ (without IORING_SETUP_SQPOLL) is completely
>>>> ignored (except that we still simulate the -ENXIO and -EINVAL  cases), correct?
>>>> (You already agreed on that above :-)
>>>
>>> Yep, and we do these -ENXIO and -EINVAL for SQPOLL as well.
>>>  
>>>> The reason for this is that io_wq is no longer maintained per io_ring_ctx,
>>>> but instead it is now global per io_uring_task.
>>>> Which means each userspace thread (or the sq_thread) has its own io_uring_task and
>>>> thus its own io_wq.
>>>
>>> Just for anyone out of context, it's per process/thread/struct task/etc.
>>> struct io_uring_task is just a bit of a context attached to a task ever submitted
>>> io_uring requests, and its' not some special kind of a task.
>>>
>>>> Regarding the IORING_SETUP_SQPOLL|IORING_SETUP_ATTACH_WQ case we still allow attaching
>>>> to the sq_thread of a different io_ring_ctx. The sq_thread runs in the context of
>>>> the io_uring_setup() syscall that created it. We used to switch current->mm, current->files
>>>> and other things before calling __io_sq_thread() before, but we no longer do that.
>>>> And this seems to be security problem to me, as it's now possible for the attached
>>>> io_ring_ctx to start sqe's copying the whole address space of the donator into
>>>> a registered fixed file of the attached process.
>>>
>>> It's not as bad, because 1) you voluntarily passes fd and 2) requires privileges,
>>> but it's a change of behaviour, which, well, can be exploited as you said.
>>
>> Yes, but pointers and other things may have a different meaning now, as they were
>> against the thread that produced the sqe's and now it's relativ to the unchanged sq_thread.
>> So unmodified application may corrupt/leak there data.
>>
>>>> As we already ignore IORING_SETUP_ATTACH_WQ without IORING_SETUP_SQPOLL, what about
>>>> ignoring it as well if the attaching task uses different ->mm, ->files, ...
>>>> So IORING_SETUP_ATTACH_WQ would only have any effect if the task calling io_uring_setup()
>>>> runs in the same context (except of the creds) as the existing sq_thread, which means it would work
>>>> if multiple userspace threads of the same userspace process want to share the sq_thread and its
>>>> io_wq. Everything else would be stupid (similar to the unshare() cases).
>>>> But as this has worked before, we just silently ignore IORING_SETUP_ATTACH_WQ is
>>>> we find a context mismatch and let io_uring_setup() silently create a new sq_thread.
>>>
>>> options:
>>> 1. Return back all that acquire_mm_files. Not great, not as safe
>>> as new io-wq, etc.
>>>
>>> 2. Completely ignore SQPOLL sharing. Performance regressions...
>>>
>>> 3. Do selected sharing. Maybe if thread group or so matches, should
>>> be safer than just mm/files check (or any subset of possibly long
>>> list). And there may be differences when the creator task do
>>> unshare/etc., but can be patched up (from where the unshare hook came
>>> in the first place).
>>>
>>> I like 3) but 2) may do as well. The only performance problem I see
>>> is for those who wanted to use it out of threads. E.g. there even
>>> was a proposal to have per-CPU SQPOLL tasks and keep them per whole
>>> system.
>>
>> Yes 2. with having a new IORING_SETUP_ATTACH_SQ (see my other mail)
>>
>> Or 3. and I guess the thread group might be ok.
>> But somehow 2 feels safer and we could start with fresh ideas from there.
> 
> 3 should be perfectly safe, outside of someone doing unshare(). And I
> think we already agreed that this case is in the realm of "garbage in,
> garbage out" and we don't need to specifically cater to it. If we match
> at attach time, we should be good to go.

Just to be clear, something like the below - totally untested, but
in principle.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6c62a3c95c1a..9a732b3b39fa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -269,6 +269,7 @@ struct io_sq_data {
 	unsigned		sq_thread_idle;
 	int			sq_cpu;
 	pid_t			task_pid;
+	pid_t			task_tgid;
 
 	unsigned long		state;
 	struct completion	startup;
@@ -7081,6 +7082,10 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
+	if (sqd->task_tgid != current->tgid) {
+		fdput(f);
+		return ERR_PTR(-EPERM);
+	}
 
 	refcount_inc(&sqd->refs);
 	fdput(f);
@@ -7091,8 +7096,14 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
 {
 	struct io_sq_data *sqd;
 
-	if (p->flags & IORING_SETUP_ATTACH_WQ)
-		return io_attach_sq_data(p);
+	if (p->flags & IORING_SETUP_ATTACH_WQ) {
+		sqd = io_attach_sq_data(p);
+		if (!IS_ERR(sqd))
+			return sqd;
+		/* fall through for EPERM case, setup new sqd/task */
+		if (PTR_ERR(sqd) != -EPERM)
+			return sqd;
+	}
 
 	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
 	if (!sqd)
@@ -7793,6 +7804,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 		}
 
 		sqd->task_pid = current->pid;
+		sqd->task_tgid = current->tgid;
 		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
 		if (IS_ERR(tsk)) {
 			ret = PTR_ERR(tsk);

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle)
  2021-03-11 15:38               ` Jens Axboe
@ 2021-03-11 15:54                 ` Stefan Metzmacher
  0 siblings, 0 replies; 17+ messages in thread
From: Stefan Metzmacher @ 2021-03-11 15:54 UTC (permalink / raw)
  To: Jens Axboe, Pavel Begunkov, io-uring

Hi Jens,

> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 6c62a3c95c1a..9a732b3b39fa 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -269,6 +269,7 @@ struct io_sq_data {
>  	unsigned		sq_thread_idle;
>  	int			sq_cpu;
>  	pid_t			task_pid;
> +	pid_t			task_tgid;
>  
>  	unsigned long		state;
>  	struct completion	startup;
> @@ -7081,6 +7082,10 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
>  		fdput(f);
>  		return ERR_PTR(-EINVAL);
>  	}
> +	if (sqd->task_tgid != current->tgid) {
> +		fdput(f);
> +		return ERR_PTR(-EPERM);
> +	}
>  
>  	refcount_inc(&sqd->refs);
>  	fdput(f);
> @@ -7091,8 +7096,14 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
>  {
>  	struct io_sq_data *sqd;
>  
> -	if (p->flags & IORING_SETUP_ATTACH_WQ)
> -		return io_attach_sq_data(p);
> +	if (p->flags & IORING_SETUP_ATTACH_WQ) {
> +		sqd = io_attach_sq_data(p);
> +		if (!IS_ERR(sqd))
> +			return sqd;
> +		/* fall through for EPERM case, setup new sqd/task */
> +		if (PTR_ERR(sqd) != -EPERM)
> +			return sqd;
> +	}
>  
>  	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
>  	if (!sqd)
> @@ -7793,6 +7804,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
>  		}
>  
>  		sqd->task_pid = current->pid;
> +		sqd->task_tgid = current->tgid;
>  		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
>  		if (IS_ERR(tsk)) {
>  			ret = PTR_ERR(tsk);
> 

Ok, that looks nice and simple, thanks!

metze

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2021-03-11 15:55 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-03-10 13:13 [PATCH 5.12 0/3] sqpoll fixes/cleanups Pavel Begunkov
2021-03-10 13:13 ` [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle Pavel Begunkov
2021-03-10 13:56   ` Stefan Metzmacher
2021-03-11 10:49     ` Stefan Metzmacher
2021-03-11 11:18     ` Pavel Begunkov
2021-03-11 11:46       ` IORING_SETUP_ATTACH_WQ (was Re: [PATCH 1/3] io_uring: fix invalid ctx->sq_thread_idle) Stefan Metzmacher
2021-03-11 12:02         ` Stefan Metzmacher
2021-03-11 15:28           ` Jens Axboe
2021-03-11 12:27         ` Pavel Begunkov
2021-03-11 12:44           ` Stefan Metzmacher
2021-03-11 15:30             ` Jens Axboe
2021-03-11 15:38               ` Jens Axboe
2021-03-11 15:54                 ` Stefan Metzmacher
2021-03-11 15:27         ` Jens Axboe
2021-03-10 13:13 ` [PATCH 2/3] io_uring: remove indirect ctx into sqo injection Pavel Begunkov
2021-03-10 13:13 ` [PATCH 3/3] io_uring: simplify io_sqd_update_thread_idle() Pavel Begunkov
2021-03-10 14:38 ` [PATCH 5.12 0/3] sqpoll fixes/cleanups Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox