Jens Axboe <axboe@kernel.dk> writes:

> Currently any attempt to register a set of task restrictions if an
> existing set exists will fail with -EPERM. But it is feasible to let the
> original creator/owner performance this operation. Either to remove
> restrictions entirely, or to replace them with a new set.
>
> If an existing set exists and NULL is passed for the new set, the
> current set is unregistered. If an existing set exists and a new set is
> supplied, the old set is dropped and replaced with the new one.

Feature-wise, I think this covers what I mentioned in the previous
iteration.  Even though this is an RFC, I think I found two bugs that
allow the child to escape the restrictions:

> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  include/linux/io_uring_types.h |  1 +
>  io_uring/register.c            | 45 ++++++++++++++++++++++++++++------
>  2 files changed, 38 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 196f41ec6d60..1ff7817b3535 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -222,6 +222,7 @@ struct io_rings {
>  struct io_restriction {
>  	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
>  	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
> +	pid_t pid;
>  	refcount_t refs;
>  	u8 sqe_flags_allowed;
>  	u8 sqe_flags_required;
> diff --git a/io_uring/register.c b/io_uring/register.c
> index 552b22f6b2dc..c8b8a9edbc65 100644
> --- a/io_uring/register.c
> +++ b/io_uring/register.c
> @@ -189,12 +189,19 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
>  {
>  	struct io_uring_task_restriction __user *ures = arg;
>  	struct io_uring_task_restriction tres;
> -	struct io_restriction *res;
> +	struct io_restriction *old_res, *res;
>  	int ret;
>  
>  	if (nr_args != 1)
>  		return -EINVAL;
>  
> +	res = current->io_uring_restrict;
> +	if (!ures) {
> +		if (!res)
> +			return -EFAULT;
> +		goto drop_set;
> +	}
> +
>  	if (copy_from_user(&tres, arg, sizeof(tres)))
>  		return -EFAULT;
>  
> @@ -207,13 +214,27 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
>  	 * Disallow if task already has registered restrictions, and we're
>  	 * not passing in further restrictions to add to an existing set.
>  	 */
> -	if (current->io_uring_restrict &&
> -	    !(tres.flags & IORING_REG_RESTRICTIONS_MASK))
> -		return -EPERM;
> +	old_res = NULL;
> +	if (res && !(tres.flags & IORING_REG_RESTRICTIONS_MASK)) {
> +		/* Not owner, may only append further restrictions */
> +drop_set:
> +		if (res->pid != current->pid)
> +			return -EPERM;

This might be hard to exploit, but if the parent terminates, the pid can
get reused.  Then, if the child forks until it gets the same pid, it can
unregister the filter.  I suppose the fix would require holding a
reference to the task, similar to what pidfd does. but perhaps just
abandon the unregistering semantics?  I'm not sure it is that useful...

> @@ -226,14 +247,22 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
>  				    tres.flags & IORING_REG_RESTRICTIONS_MASK);
>  	if (ret) {
>  		kfree(res);
> -		return ret;
> +		goto out;
>  	}
>  	if (current->io_uring_restrict &&
>  	    refcount_dec_and_test(&current->io_uring_restrict->refs))
>  		kfree(current->io_uring_restrict);
> +	res->pid = current->pid;

res->pid must always point to the first task that added a
restriction. So:

if (!current->io_uring_restrict)
       res->pid = current->pid;

Otherwise, the child will become the owner after adding another
restriction, and can then break out with a further unregister.  Based on
your testcase, this escapes the filter: