From: Pavel Begunkov <[email protected]>
To: Jens Axboe <[email protected]>, [email protected]
Cc: [email protected], 李通洲 <[email protected]>
Subject: Re: [PATCH 1/4] io_uring: allow unbreakable links
Date: Tue, 10 Dec 2019 13:13:33 +0300 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
On 12/10/2019 2:18 AM, Jens Axboe wrote:
> Some commands will invariably end in a failure in the sense that the
> completion result will be less than zero. One such example is timeouts
> that don't have a completion count set, they will always complete with
> -ETIME unless cancelled.
>
> For linked commands, we sever links and fail the rest of the chain if
> the result is less than zero. Since we have commands where we know that
> will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
> regardless of the completion result. Note that the link will still sever
> if we fail submitting the parent request, hard links are only resilient
> in the presence of completion results for requests that did submit
> correctly.
>
> Cc: [email protected] # v5.4
> Reported-by: 李通洲 <[email protected]>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> fs/io_uring.c | 54 +++++++++++++++++++++--------------
> include/uapi/linux/io_uring.h | 1 +
> 2 files changed, 33 insertions(+), 22 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 405be10da73d..662404854571 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -377,6 +377,7 @@ struct io_kiocb {
> #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
> #define REQ_F_INFLIGHT 16384 /* on inflight list */
> #define REQ_F_COMP_LOCKED 32768 /* completion under lock */
> +#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
> u64 user_data;
> u32 result;
> u32 sequence;
> @@ -941,7 +942,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
>
> list_del_init(&req->link_list);
> if (!list_empty(&nxt->link_list))
> - nxt->flags |= REQ_F_LINK;
> + nxt->flags |= req->flags & (REQ_F_LINK|REQ_F_HARDLINK);
I'm not sure we want to unconditionally propagate REQ_F_HARDLINK further.
E.g. timeout|REQ_F_HARDLINK -> read -> write
REQ_F_HARDLINK will be set on the following read, and its failure won't
cancel the write — that seems strange. If users want such behaviour, they
can set REQ_F_HARDLINK when needed by hand.
> *nxtptr = nxt;
> break;
> }
> @@ -1292,6 +1293,11 @@ static void kiocb_end_write(struct io_kiocb *req)
> file_end_write(req->file);
> }
>
> +static inline bool req_fail_links(struct io_kiocb *req)
> +{
> + return (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK;
> +}
> +
req_fail_links() sounds like it not only do checking, but actually fails
links. How about as follows?
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+}
And it would result in less code below.
> static void io_complete_rw_common(struct kiocb *kiocb, long res)
> {
> struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
> @@ -1299,7 +1305,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
> if (kiocb->ki_flags & IOCB_WRITE)
> kiocb_end_write(req);
>
> - if ((req->flags & REQ_F_LINK) && res != req->result)
> + if (res != req->result && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_cqring_add_event(req, res);
> }
> @@ -1330,7 +1336,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
> if (kiocb->ki_flags & IOCB_WRITE)
> kiocb_end_write(req);
>
> - if ((req->flags & REQ_F_LINK) && res != req->result)
> + if (res != req->result && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> req->result = res;
> if (res != -EAGAIN)
> @@ -1956,7 +1962,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> end > 0 ? end : LLONG_MAX,
> fsync_flags & IORING_FSYNC_DATASYNC);
>
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_cqring_add_event(req, ret);
> io_put_req_find_next(req, nxt);
> @@ -2003,7 +2009,7 @@ static int io_sync_file_range(struct io_kiocb *req,
>
> ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
>
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_cqring_add_event(req, ret);
> io_put_req_find_next(req, nxt);
> @@ -2079,7 +2085,7 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>
> out:
> io_cqring_add_event(req, ret);
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_put_req_find_next(req, nxt);
> return 0;
> @@ -2161,7 +2167,7 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>
> out:
> io_cqring_add_event(req, ret);
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_put_req_find_next(req, nxt);
> return 0;
> @@ -2196,7 +2202,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> }
> if (ret == -ERESTARTSYS)
> ret = -EINTR;
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_cqring_add_event(req, ret);
> io_put_req_find_next(req, nxt);
> @@ -2263,7 +2269,7 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> if (ret == -ERESTARTSYS)
> ret = -EINTR;
> out:
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_cqring_add_event(req, ret);
> io_put_req_find_next(req, nxt);
> @@ -2340,7 +2346,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> spin_unlock_irq(&ctx->completion_lock);
>
> io_cqring_add_event(req, ret);
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_put_req(req);
> return 0;
> @@ -2399,7 +2405,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
>
> io_cqring_ev_posted(ctx);
>
> - if (ret < 0 && req->flags & REQ_F_LINK)
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_put_req_find_next(req, &nxt);
> if (nxt)
> @@ -2582,7 +2588,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
> spin_unlock_irqrestore(&ctx->completion_lock, flags);
>
> io_cqring_ev_posted(ctx);
> - if (req->flags & REQ_F_LINK)
> + if (req_fail_links(req))
> 		req->flags |= REQ_F_FAIL_LINK;
> 	io_put_req(req);
> return HRTIMER_NORESTART;
> @@ -2608,7 +2614,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
> if (ret == -1)
> return -EALREADY;
>
> - if (req->flags & REQ_F_LINK)
> + if (req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_cqring_fill_event(req, -ECANCELED);
> io_put_req(req);
> @@ -2640,7 +2646,7 @@ static int io_timeout_remove(struct io_kiocb *req,
> io_commit_cqring(ctx);
> spin_unlock_irq(&ctx->completion_lock);
> io_cqring_ev_posted(ctx);
> - if (ret < 0 && req->flags & REQ_F_LINK)
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_put_req(req);
> return 0;
> @@ -2822,7 +2828,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
> spin_unlock_irqrestore(&ctx->completion_lock, flags);
> io_cqring_ev_posted(ctx);
>
> - if (ret < 0 && (req->flags & REQ_F_LINK))
> + if (ret < 0 && req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_put_req_find_next(req, nxt);
> }
> @@ -3044,7 +3050,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
> io_put_req(req);
>
> if (ret) {
> - if (req->flags & REQ_F_LINK)
> + if (req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_cqring_add_event(req, ret);
> io_put_req(req);
> @@ -3179,7 +3185,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
> spin_unlock_irqrestore(&ctx->completion_lock, flags);
>
> if (prev) {
> - if (prev->flags & REQ_F_LINK)
> + if (req_fail_links(prev))
> prev->flags |= REQ_F_FAIL_LINK;
> io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
> -ETIME);
> @@ -3273,7 +3279,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
> /* and drop final reference, if we failed */
> if (ret) {
> io_cqring_add_event(req, ret);
> - if (req->flags & REQ_F_LINK)
> + if (req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_put_req(req);
> }
> @@ -3293,7 +3299,7 @@ static void io_queue_sqe(struct io_kiocb *req)
> if (ret) {
> if (ret != -EIOCBQUEUED) {
> io_cqring_add_event(req, ret);
> - if (req->flags & REQ_F_LINK)
> + if (req_fail_links(req))
> req->flags |= REQ_F_FAIL_LINK;
> io_double_put_req(req);
> }
> @@ -3311,7 +3317,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
> }
>
>
> -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
> +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
> + IOSQE_IO_HARDLINK)
>
> static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
> struct io_kiocb **link)
> @@ -3358,13 +3365,16 @@ static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
> ret = io_req_defer_prep(req, io);
> if (ret) {
> kfree(io);
> + /* fail even hard links since we don't submit */
> prev->flags |= REQ_F_FAIL_LINK;
> goto err_req;
> }
> trace_io_uring_link(ctx, req, prev);
> list_add_tail(&req->link_list, &prev->link_list);
> - } else if (req->sqe->flags & IOSQE_IO_LINK) {
> + } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
> req->flags |= REQ_F_LINK;
> + if (req->sqe->flags & IOSQE_IO_HARDLINK)
> + req->flags |= REQ_F_HARDLINK;
>
> INIT_LIST_HEAD(&req->link_list);
> *link = req;
> @@ -3518,7 +3528,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
> * If previous wasn't linked and we have a linked command,
> * that's the end of the chain. Submit the previous link.
> */
> - if (!(sqe_flags & IOSQE_IO_LINK) && link) {
> + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
IMHO, requests shouldn't have IOSQE_IO_HARDLINK without IOSQE_IO_LINK,
and the code already enforces that. Then, it's sufficient to check only
IOSQE_IO_LINK.
> io_queue_link_head(link);
> link = NULL;
> }
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index eabccb46edd1..f296a5e77661 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -48,6 +48,7 @@ struct io_uring_sqe {
> #define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
> #define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
> #define IOSQE_IO_LINK (1U << 2) /* links next sqe */
> +#define IOSQE_IO_HARDLINK (1U << 3) /* link LINK, but stronger */
>
> /*
> * io_uring_setup() flags
>
--
Pavel Begunkov
next prev parent reply other threads:[~2019-12-10 10:13 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-12-09 23:18 [PATCHSET 0/4] io_uring patches for 5.5-rc Jens Axboe
2019-12-09 23:18 ` [PATCH 1/4] io_uring: allow unbreakable links Jens Axboe
2019-12-10 10:13 ` Pavel Begunkov [this message]
2019-12-10 15:13 ` Jens Axboe
2019-12-09 23:18 ` [PATCH 2/4] io-wq: remove worker->wait waitqueue Jens Axboe
2019-12-09 23:18 ` [PATCH 3/4] io-wq: briefly spin for new work after finishing work Jens Axboe
2019-12-09 23:18 ` [PATCH 4/4] io_uring: sqthread should grab ctx->uring_lock for submissions Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox