public inbox for [email protected]
 help / color / mirror / Atom feed
From: Jens Axboe <[email protected]>
To: Miklos Szeredi <[email protected]>
Cc: [email protected]
Subject: Re: io_uring_prep_openat_direct() and link/drain
Date: Wed, 30 Mar 2022 09:53:01 -0600	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

On 3/30/22 9:17 AM, Jens Axboe wrote:
> On 3/30/22 9:12 AM, Miklos Szeredi wrote:
>> On Wed, 30 Mar 2022 at 17:05, Jens Axboe <[email protected]> wrote:
>>>
>>> On 3/30/22 8:58 AM, Miklos Szeredi wrote:
>>>> Next issue:  seems like file slot reuse is not working correctly.
>>>> Attached program compares reads using io_uring with plain reads of
>>>> proc files.
>>>>
>>>> In the below example it is using two slots alternately but the number
>>>> of slots does not seem to matter, read is apparently always using a
>>>> stale file (the prior one to the most recent open on that slot).  See
>>>> how the sizes of the files lag by two lines:
>>>>
>>>> root@kvm:~# ./procreads
>>>> procreads: /proc/1/stat: ok (313)
>>>> procreads: /proc/2/stat: ok (149)
>>>> procreads: /proc/3/stat: read size mismatch 313/150
>>>> procreads: /proc/4/stat: read size mismatch 149/154
>>>> procreads: /proc/5/stat: read size mismatch 150/161
>>>> procreads: /proc/6/stat: read size mismatch 154/171
>>>> ...
>>>>
>>>> Any ideas?
>>>
>>> Didn't look at your code yet, but with the current tree, this is the
>>> behavior when a fixed file is used:
>>>
>>> At prep time, if the slot is valid it is used. If it isn't valid,
>>> assignment is deferred until the request is issued.
>>>
>>> Which granted is a bit weird. It means that if you do:
>>>
>>> <open fileA into slot 1, slot 1 currently unused><read slot 1>
>>>
>>> the read will read from fileA. But for:
>>>
>>> <open fileB into slot 1, slot 1 is fileA currently><read slot 1>
>>>
>>> since slot 1 is already valid at prep time for the read, the read will
>>> be from fileA again.
>>>
>>> Is this what you are seeing? It's definitely a bit confusing, and the
>>> only reason why I didn't change it is because it could potentially break
>>> applications. Don't think there's a high risk of that, however, so may
>>> indeed be worth it to just bite the bullet and the assignment is
>>> consistent (eg always done from the perspective of the previous
>>> dependent request having completed).
>>>
>>> Is this what you are seeing?
>>
>> Right, this explains it.   Then the only workaround would be to wait
>> for the open to finish before submitting the read, but that would
>> defeat the whole point of using io_uring for this purpose.
> 
> Honestly, I think we should just change it during this round, making it
> consistent with the "slot is unused" use case. The old use case is more
> more of a "it happened to work" vs the newer consistent behavior of "we
> always assign the file when execution starts on the request".
> 
> Let me spin a patch, would be great if you could test.

Something like this on top of the current tree should work. Can you
test?

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4d4ca8e115f6..d36475cefb8c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -785,7 +785,6 @@ enum {
 	REQ_F_SINGLE_POLL_BIT,
 	REQ_F_DOUBLE_POLL_BIT,
 	REQ_F_PARTIAL_IO_BIT,
-	REQ_F_DEFERRED_FILE_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
@@ -850,8 +849,6 @@ enum {
 	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
 	/* request has already done partial IO */
 	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
-	/* request has file assignment deferred */
-	REQ_F_DEFERRED_FILE	= BIT(REQ_F_DEFERRED_FILE_BIT),
 };
 
 struct async_poll {
@@ -918,7 +915,11 @@ struct io_kiocb {
 	unsigned int			flags;
 
 	u64				user_data;
-	u32				result;
+	/* fd before execution, if valid, result after execution */
+	union {
+		u32			result;
+		s32			fd;
+	};
 	u32				cflags;
 
 	struct io_ring_ctx		*ctx;
@@ -1767,20 +1768,6 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
 	}
 }
 
-static void io_assign_file(struct io_kiocb *req)
-{
-	if (req->file || !io_op_defs[req->opcode].needs_file)
-		return;
-	if (req->flags & REQ_F_DEFERRED_FILE) {
-		req->flags &= ~REQ_F_DEFERRED_FILE;
-		req->file = io_file_get(req->ctx, req, req->result,
-					req->flags & REQ_F_FIXED_FILE);
-		req->result = 0;
-	}
-	if (unlikely(!req->file))
-		req_set_fail(req);
-}
-
 static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 {
 	while (!list_empty(&ctx->defer_list)) {
@@ -1790,7 +1777,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 		if (req_need_defer(de->req, de->seq))
 			break;
 		list_del_init(&de->list);
-		io_assign_file(de->req);
 		io_req_task_queue(de->req);
 		kfree(de);
 	}
@@ -2131,7 +2117,6 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
 			if (req->flags & IO_DISARM_MASK)
 				io_disarm_next(req);
 			if (req->link) {
-				io_assign_file(req->link);
 				io_req_task_queue(req->link);
 				req->link = NULL;
 			}
@@ -2443,11 +2428,7 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 		__io_req_find_next_prep(req);
 	nxt = req->link;
 	req->link = NULL;
-	if (nxt) {
-		io_assign_file(nxt);
-		return nxt;
-	}
-	return NULL;
+	return nxt;
 }
 
 static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
@@ -2650,10 +2631,6 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
 
 static void io_req_task_queue(struct io_kiocb *req)
 {
-	if (unlikely(req->flags & REQ_F_FAIL)) {
-		io_req_task_queue_fail(req, -ECANCELED);
-		return;
-	}
 	req->io_task_work.func = io_req_task_submit;
 	io_req_task_work_add(req, false);
 }
@@ -2668,10 +2645,8 @@ static inline void io_queue_next(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt = io_req_find_next(req);
 
-	if (nxt) {
-		io_assign_file(req);
+	if (nxt)
 		io_req_task_queue(nxt);
-	}
 }
 
 static void io_free_req(struct io_kiocb *req)
@@ -4545,9 +4520,6 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!req->file)
-		return -EBADF;
-
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
@@ -7150,7 +7122,6 @@ static __cold void io_drain_req(struct io_kiocb *req)
 		spin_unlock(&ctx->completion_lock);
 queue:
 		ctx->drain_active = false;
-		io_assign_file(req);
 		io_req_task_queue(req);
 		return;
 	}
@@ -7259,6 +7230,23 @@ static void io_clean_op(struct io_kiocb *req)
 	req->flags &= ~IO_REQ_CLEAN_FLAGS;
 }
 
+static bool io_assign_file(struct io_kiocb *req)
+{
+	if (req->file || !io_op_defs[req->opcode].needs_file)
+		return true;
+
+	req->file = io_file_get(req->ctx, req, req->fd,
+					req->flags & REQ_F_FIXED_FILE);
+	if (unlikely(!req->file)) {
+		req_set_fail(req);
+		req->result = -EBADF;
+		return false;
+	}
+
+	req->result = 0;
+	return true;
+}
+
 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 {
 	const struct cred *creds = NULL;
@@ -7269,6 +7257,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (!io_op_defs[req->opcode].audit_skip)
 		audit_uring_entry(req->opcode);
+	if (unlikely(!io_assign_file(req)))
+		return -EBADF;
 
 	switch (req->opcode) {
 	case IORING_OP_NOP:
@@ -7429,8 +7419,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
 	if (timeout)
 		io_queue_linked_timeout(timeout);
 
-	io_assign_file(req);
-	if (unlikely(!req->file && def->needs_file)) {
+	if (!io_assign_file(req)) {
 		work->flags |= IO_WQ_WORK_CANCEL;
 		err = -EBADF;
 	}
@@ -7732,12 +7721,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	int personality;
 	u8 opcode;
 
+	req->file = NULL;
 	/* req is partially pre-initialised, see io_preinit_req() */
 	req->opcode = opcode = READ_ONCE(sqe->opcode);
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
 	req->flags = sqe_flags = READ_ONCE(sqe->flags);
 	req->user_data = READ_ONCE(sqe->user_data);
-	req->file = NULL;
 	req->fixed_rsrc_refs = NULL;
 	req->task = current;
 
@@ -7776,7 +7765,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	if (io_op_defs[opcode].needs_file) {
 		struct io_submit_state *state = &ctx->submit_state;
-		int fd = READ_ONCE(sqe->fd);
+
+		req->fd = READ_ONCE(sqe->fd);
 
 		/*
 		 * Plug now if we have more than 2 IO left after this, and the
@@ -7787,17 +7777,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			state->need_plug = false;
 			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
 		}
-
-		req->file = io_file_get(ctx, req, fd,
-					(sqe_flags & IOSQE_FIXED_FILE));
-		if (unlikely(!req->file)) {
-			/* unless being deferred, error is final */
-			if (!(ctx->submit_state.link.head ||
-			     (sqe_flags & IOSQE_IO_DRAIN)))
-				return -EBADF;
-			req->result = fd;
-			req->flags |= REQ_F_DEFERRED_FILE;
-		}
 	}
 
 	personality = READ_ONCE(sqe->personality);

-- 
Jens Axboe


  reply	other threads:[~2022-03-30 15:53 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-29 13:20 io_uring_prep_openat_direct() and link/drain Miklos Szeredi
2022-03-29 16:08 ` Jens Axboe
2022-03-29 17:04   ` Jens Axboe
2022-03-29 18:21     ` Miklos Szeredi
2022-03-29 18:26       ` Jens Axboe
2022-03-29 18:31         ` Miklos Szeredi
2022-03-29 18:40           ` Jens Axboe
2022-03-29 19:30             ` Miklos Szeredi
2022-03-29 20:03               ` Jens Axboe
2022-03-30  8:18                 ` Miklos Szeredi
2022-03-30 12:35                   ` Jens Axboe
2022-03-30 12:43                     ` Miklos Szeredi
2022-03-30 12:48                       ` Jens Axboe
2022-03-30 12:51                         ` Miklos Szeredi
2022-03-30 14:58                           ` Miklos Szeredi
2022-03-30 15:05                             ` Jens Axboe
2022-03-30 15:12                               ` Miklos Szeredi
2022-03-30 15:17                                 ` Jens Axboe
2022-03-30 15:53                                   ` Jens Axboe [this message]
2022-03-30 17:49                                     ` Jens Axboe
2022-04-01  8:40                                       ` Miklos Szeredi
2022-04-01 15:36                                         ` Jens Axboe
2022-04-01 16:02                                           ` Miklos Szeredi
2022-04-01 16:21                                             ` Jens Axboe
2022-04-02  1:17                                               ` Jens Axboe
2022-04-05  7:45                                                 ` Miklos Szeredi
2022-04-05 14:44                                                   ` Jens Axboe
2022-04-21 12:31                                                     ` Miklos Szeredi
2022-04-21 12:34                                                       ` Jens Axboe
2022-04-21 12:39                                                         ` Miklos Szeredi
2022-04-21 12:41                                                           ` Jens Axboe
2022-04-21 13:10                                                             ` Miklos Szeredi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox