public inbox for [email protected]
 help / color / mirror / Atom feed
From: Bijan Mottahedeh <[email protected]>
To: [email protected], [email protected], [email protected]
Subject: Re: [PATCH v5 13/13] io_uring: support buffer registration sharing
Date: Thu, 14 Jan 2021 13:17:23 -0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

On 1/13/2021 6:01 PM, Bijan Mottahedeh wrote:
> On 1/12/2021 1:33 PM, Bijan Mottahedeh wrote:
>> Implement buffer sharing among multiple rings.
>>
>> A ring shares its (future) buffer registrations at setup time with
>> IORING_SETUP_SHARE_BUF. A ring attaches to another ring's buffer
>> registration at setup time with IORING_SETUP_ATTACH_BUF, after
>> authenticating with the buffer registration owner's fd. Any updates to
>> the owner's buffer registrations become immediately available to the
>> attached rings.
>>
>> Signed-off-by: Bijan Mottahedeh <[email protected]>
>>
>> Conflicts:
>>     fs/io_uring.c
>> ---
>>   fs/io_uring.c                 | 85 
>> +++++++++++++++++++++++++++++++++++++++++--
>>   include/uapi/linux/io_uring.h |  2 +
>>   2 files changed, 83 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 37639b9..856a570b 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -8439,6 +8439,13 @@ static void io_buffers_map_free(struct 
>> io_ring_ctx *ctx)
>>       ctx->nr_user_bufs = 0;
>>   }
>> +static void io_detach_buf_data(struct io_ring_ctx *ctx)
>> +{
>> +    percpu_ref_put(&ctx->buf_data->refs);
>> +    ctx->buf_data = NULL;
>> +    ctx->nr_user_bufs = 0;
>> +}
>> +
>>   static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
>>   {
>>       struct fixed_rsrc_data *data = ctx->buf_data;
>> @@ -8447,6 +8454,11 @@ static int io_sqe_buffers_unregister(struct 
>> io_ring_ctx *ctx)
>>       if (!data)
>>           return -ENXIO;
>> +    if (ctx->flags & IORING_SETUP_ATTACH_BUF) {
>> +        io_detach_buf_data(ctx);
>> +        return 0;
>> +    }
>> +
>>       ret = io_rsrc_ref_quiesce(data, ctx, alloc_fixed_buf_ref_node);
>>       if (ret)
>>           return ret;
>> @@ -8690,9 +8702,13 @@ static struct fixed_rsrc_data 
>> *io_buffers_map_alloc(struct io_ring_ctx *ctx,
>>       if (!nr_args || nr_args > IORING_MAX_FIXED_BUFS)
>>           return ERR_PTR(-EINVAL);
>> -    buf_data = alloc_fixed_rsrc_data(ctx);
>> -    if (IS_ERR(buf_data))
>> -        return buf_data;
>> +    if (ctx->buf_data) {
>> +        buf_data = ctx->buf_data;
>> +    } else {
>> +        buf_data = alloc_fixed_rsrc_data(ctx);
>> +        if (IS_ERR(buf_data))
>> +            return buf_data;
>> +    }
>>       nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_BUFS_TABLE);
>>       buf_data->table = kcalloc(nr_tables, sizeof(*buf_data->table),
>> @@ -8757,9 +8773,17 @@ static int io_sqe_buffers_register(struct 
>> io_ring_ctx *ctx, void __user *arg,
>>       if (ctx->nr_user_bufs)
>>           return -EBUSY;
>> +    if (ctx->flags & IORING_SETUP_ATTACH_BUF) {
>> +        if (!ctx->buf_data)
>> +            return -EFAULT;
>> +        ctx->nr_user_bufs = ctx->buf_data->ctx->nr_user_bufs;
>> +        return 0;
>> +    }
>> +
>>       buf_data = io_buffers_map_alloc(ctx, nr_args);
>>       if (IS_ERR(buf_data))
>>           return PTR_ERR(buf_data);
>> +    ctx->buf_data = buf_data;
>>       for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
>>           struct io_mapped_ubuf *imu;
>> @@ -8783,7 +8807,6 @@ static int io_sqe_buffers_register(struct 
>> io_ring_ctx *ctx, void __user *arg,
>>               break;
>>       }
>> -    ctx->buf_data = buf_data;
>>       if (ret) {
>>           io_sqe_buffers_unregister(ctx);
>>           return ret;
>> @@ -9831,6 +9854,55 @@ static struct file *io_uring_get_file(struct 
>> io_ring_ctx *ctx)
>>       return file;
>>   }
>> +static int io_attach_buf_data(struct io_ring_ctx *ctx,
>> +                  struct io_uring_params *p)
>> +{
>> +    struct io_ring_ctx *ctx_attach;
>> +    struct fd f;
>> +
>> +    f = fdget(p->wq_fd);
>> +    if (!f.file)
>> +        return -EBADF;
>> +    if (f.file->f_op != &io_uring_fops) {
>> +        fdput(f);
>> +        return -EINVAL;
>> +    }
>> +
>> +    ctx_attach = f.file->private_data;
>> +    if (!ctx_attach->buf_data) {
>> +        fdput(f);
>> +        return -EINVAL;
>> +    }
>> +    ctx->buf_data = ctx_attach->buf_data;
>> +
>> +    percpu_ref_get(&ctx->buf_data->refs);
>> +    fdput(f);
>> +    return 0;
>> +}
>> +
>> +static int io_init_buf_data(struct io_ring_ctx *ctx, struct 
>> io_uring_params *p)
>> +{
>> +    if ((p->flags & (IORING_SETUP_SHARE_BUF | 
>> IORING_SETUP_ATTACH_BUF)) ==
>> +        (IORING_SETUP_SHARE_BUF | IORING_SETUP_ATTACH_BUF))
>> +        return -EINVAL;
>> +
>> +    if (p->flags & IORING_SETUP_SHARE_BUF) {
>> +        struct fixed_rsrc_data *buf_data;
>> +
>> +        buf_data = alloc_fixed_rsrc_data(ctx);
>> +        if (IS_ERR(buf_data))
>> +            return PTR_ERR(buf_data);
>> +
>> +        ctx->buf_data = buf_data;
>> +        return 0;
>> +    }
>> +
>> +    if (p->flags & IORING_SETUP_ATTACH_BUF)
>> +        return io_attach_buf_data(ctx, p);
>> +
>> +    return 0;
>> +}
>> +
>>   static int io_uring_create(unsigned entries, struct io_uring_params *p,
>>                  struct io_uring_params __user *params)
>>   {
>> @@ -9948,6 +10020,10 @@ static int io_uring_create(unsigned entries, 
>> struct io_uring_params *p,
>>       if (ret)
>>           goto err;
>> +    ret = io_init_buf_data(ctx, p);
>> +    if (ret)
>> +        goto err;
>> +
>>       ret = io_sq_offload_create(ctx, p);
>>       if (ret)
>>           goto err;
>> @@ -10028,6 +10104,7 @@ static long io_uring_setup(u32 entries, struct 
>> io_uring_params __user *params)
>>       if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
>>               IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
>>               IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
>> +            IORING_SETUP_SHARE_BUF | IORING_SETUP_ATTACH_BUF |
>>               IORING_SETUP_R_DISABLED))
>>           return -EINVAL;
>> diff --git a/include/uapi/linux/io_uring.h 
>> b/include/uapi/linux/io_uring.h
>> index b289ef8..3ad786a 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -98,6 +98,8 @@ enum {
>>   #define IORING_SETUP_CLAMP    (1U << 4)    /* clamp SQ/CQ ring sizes */
>>   #define IORING_SETUP_ATTACH_WQ    (1U << 5)    /* attach to existing 
>> wq */
>>   #define IORING_SETUP_R_DISABLED    (1U << 6)    /* start with ring 
>> disabled */
>> +#define IORING_SETUP_SHARE_BUF    (1U << 7)    /* share buffer 
>> registration */
>> +#define IORING_SETUP_ATTACH_BUF    (1U << 8)    /* attach buffer 
>> registration */
>>   enum {
>>       IORING_OP_NOP,
>>
> 
> I recreated the deadlock scenario you had raised:
> 
>  > Ok, now the original io_uring instance will wait until the attached
>  > once get rid of their references. That's a versatile ground to have
>  > in kernel deadlocks.
>  >
>  > task1: uring1 = create()
>  > task2: uring2 = create()
>  > task1: uring3 = create(share=uring2);
>  > task2: uring4 = create(share=uring1);
>  >
>  > task1: io_sqe_buffers_unregister(uring1)
>  > task2: io_sqe_buffers_unregister(uring2)
>  >
>  > If I skimmed through the code right, that should hang unkillably.
> 
> with the following test:
> 
> +static int test_deadlock(void)
> +{
> +       int i, pid, ret;
> +       struct io_uring rings[4];
> +       struct io_uring_params p = {};
> +
> +       p.flags = IORING_SETUP_SHARE_BUF;
> +
> +       for (i = 0; i < 2; i++) {
> +               ret = io_uring_queue_init_params(1, &rings[i], &p);
> +               if (ret) {
> +                       verror("queue_init share");
> +                       return ret;
> +               }
> +       }
> +
> +       p.flags = IORING_SETUP_ATTACH_BUF;
> +
> +       pid = fork();
> +       if (pid) {
> +               p.wq_fd = rings[1].ring_fd;
> +               ret = io_uring_queue_init_params(1, &rings[3], &p);
                                                             ^^^
                                                             2
> +       } else {
> +               p.wq_fd = rings[0].ring_fd;
> +               ret = io_uring_queue_init_params(1, &rings[4], &p);
                                                             ^^^
                                                             3
> +       }
> +
> +       if (ret) {
> +               verror("queue_init attach");
> +               return ret;
> +       }
> +
> +
> +       vinfo(V1, "unregister\n");
> +
> +       if (pid) {
> +               close(rings[1].ring_fd);
> +               ret = io_uring_unregister_buffers(&rings[0]);
> +       } else {
> +               close(rings[0].ring_fd);
> +               ret = io_uring_unregister_buffers(&rings[1]);
> +       }
> +
> +       vinfo(V1, "unregister done\n");
> +
> +       if (ret)
> +               verror("unregister");
> +
> +       return ret;
> +}
> 
> 
> The two processe in the test hang but can be interrupted.
> 
> I checked that
> 
> ret = wait_for_completion_interruptible(&data->done);
> 
> in io_rsrc_ref_quiesce() returns -ERESTARTSYS (-512) after hitting ^C 
> and that
> 
> ret = io_run_task_work_sig();
> 
> returns -EINTR (-4)
> 
> Finally in
> 
> io_ring_ctx_free()
> -> io_sqe_buffers_unregister()
>     -> io_rsrc_ref_quiesce()
> 
> ret = wait_for_completion_interruptible(&data->done);
> 
> returns 0.
> 
> So it looks like the unkillable hang is not there.
> 
> However, when I take out the io_uring_unregister_buffers() calls from 
> the test, one of the processes gets a segfault with the following trace 
> and I'm not sure what the cause is.

Bug in test  itself.  Wrong index passed to io_uring_queue_init_params() 
above.


  reply	other threads:[~2021-01-14 21:18 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-01-12 21:33 [PATCH v5 00/13] io_uring: buffer registration enhancements Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 01/13] io_uring: rename file related variables to rsrc Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 02/13] io_uring: generalize io_queue_rsrc_removal Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 03/13] io_uring: separate ref_list from fixed_rsrc_data Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 04/13] io_uring: split alloc_fixed_file_ref_node Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 05/13] io_uring: add rsrc_ref locking routines Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 06/13] io_uring: implement fixed buffers registration similar to fixed files Bijan Mottahedeh
2021-01-15 17:41   ` Pavel Begunkov
2021-01-12 21:33 ` [PATCH v5 07/13] io_uring: create common fixed_rsrc_ref_node handling routines Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 08/13] io_uring: generalize files_update functionlity to rsrc_update Bijan Mottahedeh
2021-01-16 18:20   ` Pavel Begunkov
2021-01-12 21:33 ` [PATCH v5 09/13] io_uring: support buffer registration updates Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 10/13] io_uring: create common fixed_rsrc_data allocation routines Bijan Mottahedeh
2021-01-16 18:26   ` Pavel Begunkov
2021-01-12 21:33 ` [PATCH v5 11/13] io_uring: make percpu_ref_release names consistent Bijan Mottahedeh
2021-01-12 21:33 ` [PATCH v5 12/13] io_uring: call io_get_fixed_rsrc_ref for buffers Bijan Mottahedeh
2021-01-15 17:44   ` Pavel Begunkov
2021-01-12 21:33 ` [PATCH v5 13/13] io_uring: support buffer registration sharing Bijan Mottahedeh
2021-01-14  2:01   ` Bijan Mottahedeh
2021-01-14 21:17     ` Bijan Mottahedeh [this message]
2021-01-16 18:30   ` Pavel Begunkov
2021-01-14 21:20 ` [PATCH v5 00/13] io_uring: buffer registration enhancements Pavel Begunkov
2021-01-14 22:44   ` Bijan Mottahedeh
2021-01-14 22:54     ` Bijan Mottahedeh
2021-01-15  4:42       ` Pavel Begunkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox