From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-15.3 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,NICE_REPLY_A,SPF_HELO_NONE,SPF_PASS,UNPARSEABLE_RELAY, USER_AGENT_SANE_1 autolearn=ham autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 6DCE3C433DB for ; Sun, 21 Feb 2021 09:47:11 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 40B7A64F03 for ; Sun, 21 Feb 2021 09:47:11 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229519AbhBUJrK (ORCPT ); Sun, 21 Feb 2021 04:47:10 -0500 Received: from out30-130.freemail.mail.aliyun.com ([115.124.30.130]:50280 "EHLO out30-130.freemail.mail.aliyun.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229932AbhBUJrK (ORCPT ); Sun, 21 Feb 2021 04:47:10 -0500 X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R971e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=alimailimapcm10staff010182156082;MF=haoxu@linux.alibaba.com;NM=1;PH=DS;RN=3;SR=0;TI=SMTPD_---0UP5aRg-_1613900784; Received: from B-25KNML85-0107.local(mailfrom:haoxu@linux.alibaba.com fp:SMTPD_---0UP5aRg-_1613900784) by smtp.aliyun-inc.com(127.0.0.1); Sun, 21 Feb 2021 17:46:24 +0800 Subject: Re: [PATCH 2/3] io_uring: fix io_rsrc_ref_quiesce races To: Pavel Begunkov , Jens Axboe , io-uring@vger.kernel.org References: <1b71f4059f088b035ec72307321f051a7be2d44f.1613767375.git.asml.silence@gmail.com> <070f2274-187d-8ee8-e841-f44beeba4fd0@gmail.com> From: Hao Xu Message-ID: Date: Sun, 21 Feb 2021 17:46:24 +0800 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Thunderbird/78.7.0 MIME-Version: 1.0 In-Reply-To: <070f2274-187d-8ee8-e841-f44beeba4fd0@gmail.com> Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 8bit Precedence: bulk List-ID: X-Mailing-List: io-uring@vger.kernel.org 在 2021/2/20 下午10:07, Pavel Begunkov 写道: > On 20/02/2021 06:31, Hao Xu wrote: >> 在 2021/2/20 上午4:45, Pavel Begunkov 写道: >>> There are different types of races in io_rsrc_ref_quiesce()  between >>> ->release() of percpu_refs and reinit_completion(), fix them by always >>> resurrecting between iterations. BTW, clean the function up, because >>> DRY. >>> >>> Fixes: 0ce4a72632317 ("io_uring: don't hold uring_lock when calling io_run_task_work*") >>> Signed-off-by: Pavel Begunkov >>> --- >>>   fs/io_uring.c | 46 +++++++++++++--------------------------------- >>>   1 file changed, 13 insertions(+), 33 deletions(-) >>> >>> diff --git a/fs/io_uring.c b/fs/io_uring.c >>> index 50d4dba08f82..38ed52065a29 100644 >>> --- a/fs/io_uring.c >>> +++ b/fs/io_uring.c >>> @@ -7316,19 +7316,6 @@ static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx, >>>       percpu_ref_get(&rsrc_data->refs); >>>   } >>>   -static int io_sqe_rsrc_add_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data) >>> -{ >>> -    struct fixed_rsrc_ref_node *backup_node; >>> - >>> -    backup_node = alloc_fixed_rsrc_ref_node(ctx); >>> -    if (!backup_node) >>> -        return -ENOMEM; >>> -    init_fixed_file_ref_node(ctx, backup_node); >>> -    io_sqe_rsrc_set_node(ctx, data, backup_node); >>> - >>> -    return 0; >>> -} >>> - >>>   static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data) >>>   { >>>       struct fixed_rsrc_ref_node *ref_node = NULL; >>> @@ -7347,36 +7334,29 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, >>>   { >>>       int ret; >>>   -    io_sqe_rsrc_kill_node(ctx, data); >>> -    percpu_ref_kill(&data->refs); >>> - >>> -    /* wait for all refs nodes to complete */ >>> -    flush_delayed_work(&ctx->rsrc_put_work); >>>       do { >>> +        io_sqe_rsrc_kill_node(ctx, data); >>> +        percpu_ref_kill(&data->refs); >>> +        flush_delayed_work(&ctx->rsrc_put_work); >>> + >>>           ret = wait_for_completion_interruptible(&data->done); >>>           if (!ret) >>>               break; >>>   -        ret = io_sqe_rsrc_add_node(ctx, data); >>> -        if (ret < 0) >>> -            break; >>> -        /* >>> -         * There is small possibility that data->done is already completed >>> -         * So reinit it here >>> -         */ >>> +        percpu_ref_resurrect(&data->refs); >> I've thought about this, if we resurrect data->refs, then we can't >> avoid parallel files unregister by percpu_refs_is_dying. > > Right, totally forgot about it, but otherwise we race with data->done. > Didn't test yet, but a diff below should do the trick. I'll test the latest version soon. > >>> +        if (ret < 0) >>> +            return ret; >>> +        backup_node = alloc_fixed_rsrc_ref_node(ctx); >>> +        if (!backup_node) >>> +            return -ENOMEM; >> Should we resurrect data->refs and reinit completion before >> signal or error return? > > Not sure what exactly you mean, we should not release uring_lock with > inconsistent state, so it's done right before unlock. And we should not > do it before wait_for_completion_interruptible() before it would take a > reference. Nevermind, I misread the code. > >>> +        init_fixed_file_ref_node(ctx, backup_node); >>> +    } while (1); >>>         destroy_fixed_rsrc_ref_node(backup_node); >>>       return 0; >>> >> > > diff --git a/fs/io_uring.c b/fs/io_uring.c > index ce5fccf00367..0af1572634de 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -236,6 +236,7 @@ struct fixed_rsrc_data { > struct fixed_rsrc_ref_node *node; > struct percpu_ref refs; > struct completion done; > + bool quiesce; > }; > > struct io_buffer { > @@ -7335,6 +7336,7 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, > struct fixed_rsrc_ref_node *backup_node; > int ret; > > + data->quiesce = true; > do { > backup_node = alloc_fixed_rsrc_ref_node(ctx); > if (!backup_node) > @@ -7353,16 +7355,19 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, > percpu_ref_resurrect(&data->refs); > synchronize_rcu(); > io_sqe_rsrc_set_node(ctx, data, backup_node); > + backup_node = NULL; > reinit_completion(&data->done); > mutex_unlock(&ctx->uring_lock); > ret = io_run_task_work_sig(); > mutex_lock(&ctx->uring_lock); > if (ret < 0) > return ret; > - } while (1); > + } while (ret >= 0); > > - destroy_fixed_rsrc_ref_node(backup_node); > - return 0; > + data->quiesce = false; > + if (backup_node) > + destroy_fixed_rsrc_ref_node(backup_node); > + return ret; > } > > static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx) > @@ -7401,7 +7406,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) > * Since we possibly drop uring lock later in this function to > * run task work. > */ > - if (!data || percpu_ref_is_dying(&data->refs)) > + if (!data || data->quiesce) > return -ENXIO; > ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put); > if (ret) >