From: Jens Axboe <[email protected]>
To: Dylan Yudaken <[email protected]>,
"[email protected]" <[email protected]>,
"[email protected]" <[email protected]>
Subject: Re: [PATCH 0/3] io_uring: consistent behaviour with linked read/write
Date: Fri, 18 Feb 2022 10:25:05 -0700
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>
On 2/18/22 10:20 AM, Dylan Yudaken wrote:
> On Thu, 2022-02-17 at 12:45 -0700, Jens Axboe wrote:
>> On 2/17/22 8:58 AM, Dylan Yudaken wrote:
>>> Currently submitting multiple read/write for one file with
>>> IOSQE_IO_LINK and offset = -1 will not behave as if calling
>>> read(2)/write(2) multiple times. The offset may be pinned to the
>>> same value for each submission (for example if they are punted to
>>> the async worker) and so each read/write will have the same offset.
>>>
>>> This patchset fixes this by grabbing the file position at execution
>>> time, rather than when the job is queued to be run.
>>>
>>> A test for this will be submitted to liburing separately.
>>>
>>> Worth noting that this does not purposefully change the result of
>>> submitting multiple read/write without IOSQE_IO_LINK (for example as
>>> in [1]). But then I do not know what the correct approach should be
>>> when submitting multiple r/w without any explicit ordering.
>>>
>>> [1]: https://lore.kernel.org/io-uring/[email protected]/
>>
>> I think this series looks great, clean and to the point. My only real
>> question is one you reference here already, which is the fpos locking
>> that we really should get done. Care to respin the referenced patch
>> on top of this series? Would hate to make that part harder...
>>
>
> Sure, I will try and figure that out and add it to the series.
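
To make the fpos issue concrete for anyone reading along, the problem
case is roughly the below. This is just a liburing sketch, untested,
with error handling omitted, and "testfile" is a stand-in for any
regular file. Two reads are linked with IOSQE_IO_LINK and both use
offset -1, so they should land on consecutive file ranges, exactly like
two back-to-back read(2) calls. Without the series, if the requests get
punted, both can end up issued with the same pinned offset.

#include <fcntl.h>
#include <stdio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char b1[4096], b2[4096];
	int fd, i;

	io_uring_queue_init(4, &ring, 0);
	fd = open("testfile", O_RDONLY);

	/* offset -1 == use (and update) the file position */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, b1, sizeof(b1), -1);
	sqe->flags |= IOSQE_IO_LINK;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, b2, sizeof(b2), -1);

	io_uring_submit(&ring);

	/* assuming testfile is big enough, the second read should
	 * start where the first one ended */
	for (i = 0; i < 2; i++) {
		io_uring_wait_cqe(&ring, &cqe);
		printf("read res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}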
Just for full public disclosure, the later version of the above patch is
below.
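
The gist of it: io_prep_rw() no longer samples ->f_pos up front, it
just marks the request REQ_F_CUR_POS. At execution time,
io_file_pos_lock() grabs ->f_pos_lock if the file needs atomic position
updates (FMODE_ATOMIC_POS and more than one file reference), tracks
that with a new REQ_F_CUR_POS_SET flag, reads the current position into
ki_pos, and optimistically bumps ->f_pos by the full expected byte
count. io_file_pos_unlock() then gives back whatever part of that
advance wasn't consumed and drops the lock on the -EAGAIN/punt/error
paths. On the nonblock issue side, a failed trylock just punts the
request to io-wq rather than blocking.
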
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fb2a0cb4aaf8..ab8a4002e0eb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -726,6 +726,7 @@ enum {
REQ_F_FAIL_BIT = 8,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
+ REQ_F_CUR_POS_SET_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_NEED_CLEANUP_BIT,
@@ -765,6 +766,8 @@ enum {
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
+ /* request is holding file position lock */
+ REQ_F_CUR_POS_SET = BIT(REQ_F_CUR_POS_SET_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has or had linked timeout */
@@ -2070,6 +2073,8 @@ static __cold void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ WARN_ON_ONCE(req->flags & REQ_F_CUR_POS_SET);
+
io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
@@ -2715,6 +2720,8 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
req_set_fail(req);
req->result = res;
}
+ if (req->flags & REQ_F_CUR_POS && res > 0)
+ req->file->f_pos += res;
return false;
}
@@ -2892,12 +2899,15 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kiocb->ki_pos = READ_ONCE(sqe->off);
if (kiocb->ki_pos == -1) {
- if (!(file->f_mode & FMODE_STREAM)) {
+ /*
+ * If we end up using the current file position, just set the
+ * flag and the actual file position will be read in the op
+ * handlers themselves.
+ */
+ if (!(file->f_mode & FMODE_STREAM))
req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = file->f_pos;
- } else {
+ else
kiocb->ki_pos = 0;
- }
}
kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
@@ -2979,8 +2989,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
ret += io->bytes_done;
}
- if (req->flags & REQ_F_CUR_POS)
- req->file->f_pos = kiocb->ki_pos;
if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
__io_complete_rw(req, ret, 0, issue_flags);
else
@@ -3515,6 +3523,37 @@ static bool need_read_all(struct io_kiocb *req)
S_ISBLK(file_inode(req->file)->i_mode);
}
+static bool io_file_pos_lock(struct io_kiocb *req, bool force_nonblock)
+{
+ struct file *file = req->file;
+
+ if (!(req->flags & REQ_F_CUR_POS))
+ return false;
+ WARN_ON_ONCE(req->flags & REQ_F_CUR_POS_SET);
+ if (file->f_mode & FMODE_ATOMIC_POS && file_count(file) > 1) {
+ if (force_nonblock) {
+ if (!mutex_trylock(&file->f_pos_lock))
+ return true;
+ } else {
+ mutex_lock(&file->f_pos_lock);
+ }
+ req->flags |= REQ_F_CUR_POS_SET;
+ }
+ req->rw.kiocb.ki_pos = file->f_pos;
+ file->f_pos += req->result;
+ return false;
+}
+
+static void io_file_pos_unlock(struct io_kiocb *req, ssize_t adjust)
+{
+ if (!(req->flags & REQ_F_CUR_POS_SET))
+ return;
+ if (adjust > 0)
+ req->file->f_pos -= adjust;
+ req->flags &= ~REQ_F_CUR_POS_SET;
+ __f_unlock_pos(req->file);
+}
+
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rw_state __s, *s = &__s;
@@ -3543,18 +3582,22 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
- if (unlikely(!io_file_supports_nowait(req))) {
+ if (unlikely(!io_file_supports_nowait(req) ||
+ io_file_pos_lock(req, true))) {
ret = io_setup_async_rw(req, iovec, s, true);
+ WARN_ON_ONCE(req->flags & REQ_F_CUR_POS_SET);
return ret ?: -EAGAIN;
}
kiocb->ki_flags |= IOCB_NOWAIT;
} else {
/* Ensure we clear previously set non-block flag */
kiocb->ki_flags &= ~IOCB_NOWAIT;
+ io_file_pos_lock(req, false);
}
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret)) {
+ io_file_pos_unlock(req, req->result);
kfree(iovec);
return ret;
}
@@ -3565,10 +3608,10 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_REISSUE;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
- goto done;
+ goto out_unlock;
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (req->flags & REQ_F_NOWAIT)
- goto done;
+ goto out_unlock;
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
@@ -3586,8 +3629,10 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
iov_iter_restore(&s->iter, &s->iter_state);
ret2 = io_setup_async_rw(req, iovec, s, true);
- if (ret2)
+ if (ret2) {
+ io_file_pos_unlock(req, ret);
return ret2;
+ }
iovec = NULL;
rw = req->async_data;
@@ -3612,6 +3657,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
kiocb->ki_flags &= ~IOCB_WAITQ;
+ io_file_pos_unlock(req, req->result - rw->bytes_done);
return -EAGAIN;
}
@@ -3622,19 +3668,26 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* here, and if we do, then just retry at the new offset.
*/
ret = io_iter_do_read(req, &s->iter);
- if (ret == -EIOCBQUEUED)
+ if (ret == -EIOCBQUEUED) {
+ io_file_pos_unlock(req, ret);
return 0;
+ }
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
iov_iter_restore(&s->iter, &s->iter_state);
} while (ret > 0);
+ io_file_pos_unlock(req, req->result - rw->bytes_done);
done:
kiocb_done(kiocb, ret, issue_flags);
out_free:
/* it's faster to check here then delegate to kfree */
if (iovec)
kfree(iovec);
+ io_file_pos_unlock(req, 0);
return 0;
+out_unlock:
+ io_file_pos_unlock(req, ret);
+ goto done;
}
static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -3668,7 +3721,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
- if (unlikely(!io_file_supports_nowait(req)))
+ if (unlikely(!io_file_supports_nowait(req) ||
+ io_file_pos_lock(req, true)))
goto copy_iov;
/* file path doesn't support NOWAIT for non-direct_IO */
@@ -3680,6 +3734,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
} else {
/* Ensure we clear previously set non-block flag */
kiocb->ki_flags &= ~IOCB_NOWAIT;
+ io_file_pos_lock(req, false);
}
ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
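
The idea with adjusting ->f_pos up front rather than at completion is
that a linked request executing next immediately sees the offset it
would get from back-to-back read(2)/write(2) calls; short IO and
failures hand the difference back via io_file_pos_unlock().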
--
Jens Axboe