* [PATCHSET 0/2] io_uring: handle short reads internally @ 2020-08-13 17:56 Jens Axboe 2020-08-13 17:56 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe ` (2 more replies) 0 siblings, 3 replies; 18+ messages in thread From: Jens Axboe @ 2020-08-13 17:56 UTC (permalink / raw) To: io-uring; +Cc: david, jmoyer Since we've had a few cases of applications not dealing with this appopriately, I believe the safest course of action is to ensure that we don't return short reads when we really don't have to. The first patch is just a prep patch that retains iov_iter state over retries, while the second one actually enables just doing retries if we get a short read back. -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls 2020-08-13 17:56 [PATCHSET 0/2] io_uring: handle short reads internally Jens Axboe @ 2020-08-13 17:56 ` Jens Axboe 2020-08-13 17:56 ` [PATCH 2/2] io_uring: internally retry short reads Jens Axboe 2020-08-13 20:25 ` [PATCHSET 0/2] io_uring: handle short reads internally Jeff Moyer 2 siblings, 0 replies; 18+ messages in thread From: Jens Axboe @ 2020-08-13 17:56 UTC (permalink / raw) To: io-uring; +Cc: david, jmoyer, Jens Axboe Instead of maintaining (and setting/remembering) iov_iter size and segment counts, just put the iov_iter in the async part of the IO structure. This is mostly a preparation patch for doing appropriate internal retries for short reads, but it also cleans up the state handling nicely and simplifies it quite a bit. Signed-off-by: Jens Axboe <[email protected]> --- fs/io_uring.c | 118 ++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 62 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1ec25ee71372..a20fccf91d76 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -508,9 +508,7 @@ struct io_async_msghdr { struct io_async_rw { struct iovec fast_iov[UIO_FASTIOV]; - struct iovec *iov; - ssize_t nr_segs; - ssize_t size; + struct iov_iter iter; struct wait_page_queue wpq; }; @@ -915,9 +913,8 @@ static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock); -static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, - struct iov_iter *iter); +static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec, + struct iovec *fast_iov, struct iov_iter *iter); static struct kmem_cache *req_cachep; @@ -2299,7 +2296,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) ret = io_import_iovec(rw, req, &iovec, &iter, false); if (ret < 0) goto end_req; - ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter); if (!ret) return true; kfree(iovec); @@ -2830,6 +2827,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = NULL; + return iov_iter_count(&iorw->iter); + } + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { buf = io_rw_buffer_select(req, &sqe_len, needs_lock); @@ -2845,14 +2849,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return ret < 0 ? ret : sqe_len; } - if (req->io) { - struct io_async_rw *iorw = &req->io->rw; - - iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size); - *iovec = NULL; - return iorw->size; - } - if (req->flags & REQ_F_BUFFER_SELECT) { ret = io_iov_buffer_select(req, *iovec, needs_lock); if (!ret) { @@ -2930,21 +2926,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, - struct iov_iter *iter) +static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, + struct iovec *fast_iov, struct iov_iter *iter) { struct io_async_rw *rw = &req->io->rw; - rw->nr_segs = iter->nr_segs; - rw->size = io_size; + memcpy(&rw->iter, iter, sizeof(*iter)); if (!iovec) { - rw->iov = rw->fast_iov; - if (rw->iov != fast_iov) - memcpy(rw->iov, fast_iov, + rw->iter.iov = rw->fast_iov; + if (rw->iter.iov != fast_iov) + memcpy((void *) rw->iter.iov, fast_iov, sizeof(struct iovec) * iter->nr_segs); } else { - rw->iov = iovec; + rw->iter.iov = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } @@ -2963,9 +2957,8 @@ static int io_alloc_async_ctx(struct io_kiocb *req) return __io_alloc_async_ctx(req); } -static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, - struct iovec *iovec, struct iovec *fast_iov, - struct iov_iter *iter) +static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec, + struct iovec *fast_iov, struct iov_iter *iter) { if (!io_op_defs[req->opcode].async_ctx) return 0; @@ -2973,7 +2966,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, if (__io_alloc_async_ctx(req)) return -ENOMEM; - io_req_map_rw(req, io_size, iovec, fast_iov, iter); + io_req_map_rw(req, iovec, fast_iov, iter); } return 0; } @@ -2981,18 +2974,19 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, static inline int io_rw_prep_async(struct io_kiocb *req, int rw, bool force_nonblock) { - struct io_async_ctx *io = req->io; - struct iov_iter iter; + struct io_async_rw *iorw = &req->io->rw; ssize_t ret; - io->rw.iov = io->rw.fast_iov; + iorw->iter.iov = iorw->fast_iov; + /* reset ->io around the iovec import, we don't want to use it */ req->io = NULL; - ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; + ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, + &iorw->iter, !force_nonblock); + req->io = container_of(iorw, struct io_async_ctx, rw); if (unlikely(ret < 0)) return ret; - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); return 0; } @@ -3090,7 +3084,8 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry. */ -static bool io_rw_should_retry(struct io_kiocb *req) +static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, + struct iovec *fast_iov, struct iov_iter *iter) { struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -3113,8 +3108,11 @@ static bool io_rw_should_retry(struct io_kiocb *req) * If request type doesn't require req->io to defer in general, * we need to allocate it here */ - if (!req->io && __io_alloc_async_ctx(req)) - return false; + if (!req->io) { + if (__io_alloc_async_ctx(req)) + return false; + io_req_map_rw(req, iovec, fast_iov, iter); + } ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, io_async_buf_func, req); @@ -3141,12 +3139,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter iter; + struct iov_iter __iter, *iter = &__iter; size_t iov_count; ssize_t io_size, ret, ret2; - unsigned long nr_segs; - ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); + if (req->io) + iter = &req->io->rw.iter; + + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; io_size = ret; @@ -3160,30 +3160,26 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; - iov_count = iov_iter_count(&iter); - nr_segs = iter.nr_segs; + iov_count = iov_iter_count(iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (unlikely(ret)) goto out_free; - ret2 = io_iter_do_read(req, &iter); + ret2 = io_iter_do_read(req, iter); /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2, cs); } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, - &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter); if (ret) goto out_free; /* it's copied and will be cleaned with ->io */ iovec = NULL; /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req)) { - ret2 = io_iter_do_read(req, &iter); + if (io_rw_should_retry(req, iovec, inline_vecs, iter)) { + ret2 = io_iter_do_read(req, iter); if (ret2 == -EIOCBQUEUED) { goto out_free; } else if (ret2 != -EAGAIN) { @@ -3223,12 +3219,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter iter; + struct iov_iter __iter, *iter = &__iter; size_t iov_count; ssize_t ret, ret2, io_size; - unsigned long nr_segs; - ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); + if (req->io) + iter = &req->io->rw.iter; + + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; io_size = ret; @@ -3247,8 +3245,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - iov_count = iov_iter_count(&iter); - nr_segs = iter.nr_segs; + iov_count = iov_iter_count(iter); ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (unlikely(ret)) goto out_free; @@ -3269,9 +3266,9 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, kiocb->ki_flags |= IOCB_WRITE; if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); + ret2 = call_write_iter(req->file, kiocb, iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter); else ret2 = -EINVAL; @@ -3284,11 +3281,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, cs); } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, - &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter); if (ret) goto out_free; /* it's copied and will be cleaned with ->io */ @@ -5583,8 +5577,8 @@ static void __io_clean_op(struct io_kiocb *req) case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); + if (io->rw.iter.iov != io->rw.fast_iov) + kfree(io->rw.iter.iov); break; case IORING_OP_RECVMSG: case IORING_OP_SENDMSG: -- 2.28.0 ^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 2/2] io_uring: internally retry short reads 2020-08-13 17:56 [PATCHSET 0/2] io_uring: handle short reads internally Jens Axboe 2020-08-13 17:56 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe @ 2020-08-13 17:56 ` Jens Axboe 2020-08-13 20:25 ` [PATCHSET 0/2] io_uring: handle short reads internally Jeff Moyer 2 siblings, 0 replies; 18+ messages in thread From: Jens Axboe @ 2020-08-13 17:56 UTC (permalink / raw) To: io-uring; +Cc: david, jmoyer, Jens Axboe We've had a few application cases of not handling short reads properly, and it is understandable as short reads aren't really expected if the application isn't doing non-blocking IO. Now that we retain the iov_iter over retries, we can implement internal retry pretty trivially. This ensures that we don't return a short read, even for buffered reads on page cache conflicts. Signed-off-by: Jens Axboe <[email protected]> --- fs/io_uring.c | 72 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a20fccf91d76..c6a9f1b11e85 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -509,6 +509,7 @@ struct io_async_msghdr { struct io_async_rw { struct iovec fast_iov[UIO_FASTIOV]; struct iov_iter iter; + size_t bytes_done; struct wait_page_queue wpq; }; @@ -914,7 +915,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock); static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec, - struct iovec *fast_iov, struct iov_iter *iter); + struct iovec *fast_iov, struct iov_iter *iter, + bool force); static struct kmem_cache *req_cachep; @@ -2296,7 +2298,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) ret = io_import_iovec(rw, req, &iovec, &iter, false); if (ret < 0) goto end_req; - ret = io_setup_async_rw(req, iovec, inline_vecs, &iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); if (!ret) return true; kfree(iovec); @@ -2586,6 +2588,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + /* add previously done IO, if any */ + if (req->io && req->io->rw.bytes_done > 0) { + if (ret < 0) + ret = req->io->rw.bytes_done; + else + ret += req->io->rw.bytes_done; + } + if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && kiocb->ki_complete == io_complete_rw) @@ -2932,6 +2942,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_async_rw *rw = &req->io->rw; memcpy(&rw->iter, iter, sizeof(*iter)); + rw->bytes_done = 0; if (!iovec) { rw->iter.iov = rw->fast_iov; if (rw->iter.iov != fast_iov) @@ -2958,9 +2969,10 @@ static int io_alloc_async_ctx(struct io_kiocb *req) } static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec, - struct iovec *fast_iov, struct iov_iter *iter) + struct iovec *fast_iov, struct iov_iter *iter, + bool force) { - if (!io_op_defs[req->opcode].async_ctx) + if (!force && !io_op_defs[req->opcode].async_ctx) return 0; if (!req->io) { if (__io_alloc_async_ctx(req)) @@ -3084,8 +3096,7 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry. */ -static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, - struct iovec *fast_iov, struct iov_iter *iter) +static bool io_rw_should_retry(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -3094,8 +3105,8 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, if (req->flags & REQ_F_NOWAIT) return false; - /* already tried, or we're doing O_DIRECT */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) + /* Only for buffered IO */ + if (kiocb->ki_flags & IOCB_DIRECT) return false; /* * just use poll if we can, and don't attempt if the fs doesn't @@ -3104,16 +3115,6 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) return false; - /* - * If request type doesn't require req->io to defer in general, - * we need to allocate it here - */ - if (!req->io) { - if (__io_alloc_async_ctx(req)) - return false; - io_req_map_rw(req, iovec, fast_iov, iter); - } - ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, io_async_buf_func, req); if (!ret) { @@ -3167,29 +3168,46 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret2 = io_iter_do_read(req, iter); - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2, cs); - } else { + if (!iov_iter_count(iter)) + goto done; + + if (ret2 >= 0) { + /* successful read, but partial */ + if (req->flags & REQ_F_NOWAIT) + goto done; + io_size -= ret2; copy_iov: - ret = io_setup_async_rw(req, iovec, inline_vecs, iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); if (ret) goto out_free; /* it's copied and will be cleaned with ->io */ iovec = NULL; + /* now use our persistent iterator, if we aren't already */ + iter = &req->io->rw.iter; + + if (ret2 > 0) { +retry: + req->io->rw.bytes_done += ret2; + } /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req, iovec, inline_vecs, iter)) { + if (io_rw_should_retry(req)) { ret2 = io_iter_do_read(req, iter); if (ret2 == -EIOCBQUEUED) { goto out_free; + } else if (ret2 == io_size) { + goto done; } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - goto out_free; + /* we got some bytes, but not all. retry. */ + if (ret2 > 0) + goto retry; + goto done; } } kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN; } +done: + kiocb_done(kiocb, ret2, cs); out_free: if (iovec) kfree(iovec); @@ -3282,7 +3300,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, kiocb_done(kiocb, ret2, cs); } else { copy_iov: - ret = io_setup_async_rw(req, iovec, inline_vecs, iter); + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (ret) goto out_free; /* it's copied and will be cleaned with ->io */ -- 2.28.0 ^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 17:56 [PATCHSET 0/2] io_uring: handle short reads internally Jens Axboe 2020-08-13 17:56 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe 2020-08-13 17:56 ` [PATCH 2/2] io_uring: internally retry short reads Jens Axboe @ 2020-08-13 20:25 ` Jeff Moyer 2020-08-13 20:33 ` Jens Axboe 2 siblings, 1 reply; 18+ messages in thread From: Jeff Moyer @ 2020-08-13 20:25 UTC (permalink / raw) To: Jens Axboe; +Cc: io-uring, david Jens Axboe <[email protected]> writes: > Since we've had a few cases of applications not dealing with this > appopriately, I believe the safest course of action is to ensure that > we don't return short reads when we really don't have to. > > The first patch is just a prep patch that retains iov_iter state over > retries, while the second one actually enables just doing retries if > we get a short read back. Have you run this through the liburing regression tests? Tests <eeed8b54e0df-test> <timeout-overflow> <read-write> failed I'll take a look at the failures, but wanted to bring it to your attention sooner rather than later. I was using your io_uring-5.9 branch. -Jeff ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 20:25 ` [PATCHSET 0/2] io_uring: handle short reads internally Jeff Moyer @ 2020-08-13 20:33 ` Jens Axboe 2020-08-13 20:37 ` Jens Axboe 0 siblings, 1 reply; 18+ messages in thread From: Jens Axboe @ 2020-08-13 20:33 UTC (permalink / raw) To: Jeff Moyer; +Cc: io-uring, david On 8/13/20 2:25 PM, Jeff Moyer wrote: > Jens Axboe <[email protected]> writes: > >> Since we've had a few cases of applications not dealing with this >> appopriately, I believe the safest course of action is to ensure that >> we don't return short reads when we really don't have to. >> >> The first patch is just a prep patch that retains iov_iter state over >> retries, while the second one actually enables just doing retries if >> we get a short read back. > > Have you run this through the liburing regression tests? > > Tests <eeed8b54e0df-test> <timeout-overflow> <read-write> failed > > I'll take a look at the failures, but wanted to bring it to your > attention sooner rather than later. I was using your io_uring-5.9 > branch. The eed8b54e0df-test failure is known with this one, pretty sure it was always racy, but I'm looking into it. The timeout-overflow test needs fixing, it's just an ordering thing with the batched completions done through submit. Not new with these patches. The read-write one I'm interested in, what did you run it on? And what was the failure? -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 20:33 ` Jens Axboe @ 2020-08-13 20:37 ` Jens Axboe 2020-08-13 20:41 ` Jens Axboe 0 siblings, 1 reply; 18+ messages in thread From: Jens Axboe @ 2020-08-13 20:37 UTC (permalink / raw) To: Jeff Moyer; +Cc: io-uring, david On 8/13/20 2:33 PM, Jens Axboe wrote: > On 8/13/20 2:25 PM, Jeff Moyer wrote: >> Jens Axboe <[email protected]> writes: >> >>> Since we've had a few cases of applications not dealing with this >>> appopriately, I believe the safest course of action is to ensure that >>> we don't return short reads when we really don't have to. >>> >>> The first patch is just a prep patch that retains iov_iter state over >>> retries, while the second one actually enables just doing retries if >>> we get a short read back. >> >> Have you run this through the liburing regression tests? >> >> Tests <eeed8b54e0df-test> <timeout-overflow> <read-write> failed >> >> I'll take a look at the failures, but wanted to bring it to your >> attention sooner rather than later. I was using your io_uring-5.9 >> branch. > > The eed8b54e0df-test failure is known with this one, pretty sure it > was always racy, but I'm looking into it. > > The timeout-overflow test needs fixing, it's just an ordering thing > with the batched completions done through submit. Not new with these > patches. > > The read-write one I'm interested in, what did you run it on? And > what was the failure? BTW, what git sha did you run? -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 20:37 ` Jens Axboe @ 2020-08-13 20:41 ` Jens Axboe 2020-08-13 20:49 ` Jeff Moyer 0 siblings, 1 reply; 18+ messages in thread From: Jens Axboe @ 2020-08-13 20:41 UTC (permalink / raw) To: Jeff Moyer; +Cc: io-uring, david On 8/13/20 2:37 PM, Jens Axboe wrote: > On 8/13/20 2:33 PM, Jens Axboe wrote: >> On 8/13/20 2:25 PM, Jeff Moyer wrote: >>> Jens Axboe <[email protected]> writes: >>> >>>> Since we've had a few cases of applications not dealing with this >>>> appopriately, I believe the safest course of action is to ensure that >>>> we don't return short reads when we really don't have to. >>>> >>>> The first patch is just a prep patch that retains iov_iter state over >>>> retries, while the second one actually enables just doing retries if >>>> we get a short read back. >>> >>> Have you run this through the liburing regression tests? >>> >>> Tests <eeed8b54e0df-test> <timeout-overflow> <read-write> failed >>> >>> I'll take a look at the failures, but wanted to bring it to your >>> attention sooner rather than later. I was using your io_uring-5.9 >>> branch. >> >> The eed8b54e0df-test failure is known with this one, pretty sure it >> was always racy, but I'm looking into it. >> >> The timeout-overflow test needs fixing, it's just an ordering thing >> with the batched completions done through submit. Not new with these >> patches. >> >> The read-write one I'm interested in, what did you run it on? And >> what was the failure? > > BTW, what git sha did you run? I do see a failure with dm on that, I'll take a look. -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 20:41 ` Jens Axboe @ 2020-08-13 20:49 ` Jeff Moyer 2020-08-13 22:08 ` Jens Axboe 0 siblings, 1 reply; 18+ messages in thread From: Jeff Moyer @ 2020-08-13 20:49 UTC (permalink / raw) To: Jens Axboe; +Cc: io-uring, david Jens Axboe <[email protected]> writes: > On 8/13/20 2:37 PM, Jens Axboe wrote: >> On 8/13/20 2:33 PM, Jens Axboe wrote: >>> On 8/13/20 2:25 PM, Jeff Moyer wrote: >>>> Jens Axboe <[email protected]> writes: >>>> >>>>> Since we've had a few cases of applications not dealing with this >>>>> appopriately, I believe the safest course of action is to ensure that >>>>> we don't return short reads when we really don't have to. >>>>> >>>>> The first patch is just a prep patch that retains iov_iter state over >>>>> retries, while the second one actually enables just doing retries if >>>>> we get a short read back. >>>> >>>> Have you run this through the liburing regression tests? >>>> >>>> Tests <eeed8b54e0df-test> <timeout-overflow> <read-write> failed >>>> >>>> I'll take a look at the failures, but wanted to bring it to your >>>> attention sooner rather than later. I was using your io_uring-5.9 >>>> branch. >>> >>> The eed8b54e0df-test failure is known with this one, pretty sure it >>> was always racy, but I'm looking into it. >>> >>> The timeout-overflow test needs fixing, it's just an ordering thing >>> with the batched completions done through submit. Not new with these >>> patches. OK. >>> The read-write one I'm interested in, what did you run it on? And >>> what was the failure? >> >> BTW, what git sha did you run? > > I do see a failure with dm on that, I'll take a look. I ran it on a file system atop nvme with 8 poll queues. liburing head: 9e1d69e078ee51f253a829ff421b17cfc996d158 linux-block head: ff1353802d86a9d8e40ef1377efb12a1d3000a20 The error I saw was: Running test read-write: Non-vectored IO not supported, skipping cqe res -22, wanted 2048 test_buf_select_short vec failed Test read-write failed with ret 1 But I don't think it was due to these two commits. Thanks, Jeff ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 20:49 ` Jeff Moyer @ 2020-08-13 22:08 ` Jens Axboe 2020-08-13 22:21 ` Jeff Moyer 0 siblings, 1 reply; 18+ messages in thread From: Jens Axboe @ 2020-08-13 22:08 UTC (permalink / raw) To: Jeff Moyer; +Cc: io-uring, david On 8/13/20 2:49 PM, Jeff Moyer wrote: > Jens Axboe <[email protected]> writes: > >> On 8/13/20 2:37 PM, Jens Axboe wrote: >>> On 8/13/20 2:33 PM, Jens Axboe wrote: >>>> On 8/13/20 2:25 PM, Jeff Moyer wrote: >>>>> Jens Axboe <[email protected]> writes: >>>>> >>>>>> Since we've had a few cases of applications not dealing with this >>>>>> appopriately, I believe the safest course of action is to ensure that >>>>>> we don't return short reads when we really don't have to. >>>>>> >>>>>> The first patch is just a prep patch that retains iov_iter state over >>>>>> retries, while the second one actually enables just doing retries if >>>>>> we get a short read back. >>>>> >>>>> Have you run this through the liburing regression tests? >>>>> >>>>> Tests <eeed8b54e0df-test> <timeout-overflow> <read-write> failed >>>>> >>>>> I'll take a look at the failures, but wanted to bring it to your >>>>> attention sooner rather than later. I was using your io_uring-5.9 >>>>> branch. >>>> >>>> The eed8b54e0df-test failure is known with this one, pretty sure it >>>> was always racy, but I'm looking into it. >>>> >>>> The timeout-overflow test needs fixing, it's just an ordering thing >>>> with the batched completions done through submit. Not new with these >>>> patches. > > OK. > >>>> The read-write one I'm interested in, what did you run it on? And >>>> what was the failure? >>> >>> BTW, what git sha did you run? >> >> I do see a failure with dm on that, I'll take a look. > > I ran it on a file system atop nvme with 8 poll queues. > > liburing head: 9e1d69e078ee51f253a829ff421b17cfc996d158 > linux-block head: ff1353802d86a9d8e40ef1377efb12a1d3000a20 Fixed it, and actually enabled a further cleanup. > The error I saw was: > > Running test read-write: > Non-vectored IO not supported, skipping > cqe res -22, wanted 2048 > test_buf_select_short vec failed > Test read-write failed with ret 1 > > But I don't think it was due to these two commits. Yeah don't think so either. -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 22:08 ` Jens Axboe @ 2020-08-13 22:21 ` Jeff Moyer 2020-08-13 22:31 ` Jens Axboe 0 siblings, 1 reply; 18+ messages in thread From: Jeff Moyer @ 2020-08-13 22:21 UTC (permalink / raw) To: Jens Axboe; +Cc: io-uring, david Jens Axboe <[email protected]> writes: >>>> BTW, what git sha did you run? >>> >>> I do see a failure with dm on that, I'll take a look. >> >> I ran it on a file system atop nvme with 8 poll queues. >> >> liburing head: 9e1d69e078ee51f253a829ff421b17cfc996d158 >> linux-block head: ff1353802d86a9d8e40ef1377efb12a1d3000a20 > > Fixed it, and actually enabled a further cleanup. Great, thanks! Did you push that out somewhere? -Jeff ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 22:21 ` Jeff Moyer @ 2020-08-13 22:31 ` Jens Axboe 2020-08-17 20:55 ` Jeff Moyer 0 siblings, 1 reply; 18+ messages in thread From: Jens Axboe @ 2020-08-13 22:31 UTC (permalink / raw) To: Jeff Moyer; +Cc: io-uring, david On 8/13/20 4:21 PM, Jeff Moyer wrote: > Jens Axboe <[email protected]> writes: > >>>>> BTW, what git sha did you run? >>>> >>>> I do see a failure with dm on that, I'll take a look. >>> >>> I ran it on a file system atop nvme with 8 poll queues. >>> >>> liburing head: 9e1d69e078ee51f253a829ff421b17cfc996d158 >>> linux-block head: ff1353802d86a9d8e40ef1377efb12a1d3000a20 >> >> Fixed it, and actually enabled a further cleanup. > > Great, thanks! Did you push that out somewhere? It's pushed to io_uring-5.9, current sha is: ee6ac2d3d5cc50d58ca55a5967671c9c1f38b085 FWIW, the issue was just for fixed buffers. It's running through the usual testing now. -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-13 22:31 ` Jens Axboe @ 2020-08-17 20:55 ` Jeff Moyer 2020-08-17 20:57 ` Jens Axboe 0 siblings, 1 reply; 18+ messages in thread From: Jeff Moyer @ 2020-08-17 20:55 UTC (permalink / raw) To: Jens Axboe; +Cc: io-uring, david Jens Axboe <[email protected]> writes: > On 8/13/20 4:21 PM, Jeff Moyer wrote: >> Jens Axboe <[email protected]> writes: >> >>>>>> BTW, what git sha did you run? >>>>> >>>>> I do see a failure with dm on that, I'll take a look. >>>> >>>> I ran it on a file system atop nvme with 8 poll queues. >>>> >>>> liburing head: 9e1d69e078ee51f253a829ff421b17cfc996d158 >>>> linux-block head: ff1353802d86a9d8e40ef1377efb12a1d3000a20 >>> >>> Fixed it, and actually enabled a further cleanup. >> >> Great, thanks! Did you push that out somewhere? > > It's pushed to io_uring-5.9, current sha is: > > ee6ac2d3d5cc50d58ca55a5967671c9c1f38b085 > > FWIW, the issue was just for fixed buffers. It's running through the > usual testing now. OK. Since it was an unrelated problem, I was expecting a separate commit for it. What was the exact issue? Is it something that needs backporting to -stable? -Jeff ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-17 20:55 ` Jeff Moyer @ 2020-08-17 20:57 ` Jens Axboe 2020-08-18 17:55 ` Jeff Moyer 0 siblings, 1 reply; 18+ messages in thread From: Jens Axboe @ 2020-08-17 20:57 UTC (permalink / raw) To: Jeff Moyer; +Cc: io-uring, david On 8/17/20 1:55 PM, Jeff Moyer wrote: > Jens Axboe <[email protected]> writes: > >> On 8/13/20 4:21 PM, Jeff Moyer wrote: >>> Jens Axboe <[email protected]> writes: >>> >>>>>>> BTW, what git sha did you run? >>>>>> >>>>>> I do see a failure with dm on that, I'll take a look. >>>>> >>>>> I ran it on a file system atop nvme with 8 poll queues. >>>>> >>>>> liburing head: 9e1d69e078ee51f253a829ff421b17cfc996d158 >>>>> linux-block head: ff1353802d86a9d8e40ef1377efb12a1d3000a20 >>>> >>>> Fixed it, and actually enabled a further cleanup. >>> >>> Great, thanks! Did you push that out somewhere? >> >> It's pushed to io_uring-5.9, current sha is: >> >> ee6ac2d3d5cc50d58ca55a5967671c9c1f38b085 >> >> FWIW, the issue was just for fixed buffers. It's running through the >> usual testing now. > > OK. Since it was an unrelated problem, I was expecting a separate > commit for it. What was the exact issue? Is it something that needs > backporting to -stable? No, it was a bug in the posted patch, so I just folded in the fix. -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-17 20:57 ` Jens Axboe @ 2020-08-18 17:55 ` Jeff Moyer 2020-08-18 18:00 ` Jens Axboe 0 siblings, 1 reply; 18+ messages in thread From: Jeff Moyer @ 2020-08-18 17:55 UTC (permalink / raw) To: Jens Axboe; +Cc: io-uring, david Jens Axboe <[email protected]> writes: > On 8/17/20 1:55 PM, Jeff Moyer wrote: >> Jens Axboe <[email protected]> writes: >> >>> On 8/13/20 4:21 PM, Jeff Moyer wrote: >>>> Jens Axboe <[email protected]> writes: >>>> >>>>>>>> BTW, what git sha did you run? >>>>>>> >>>>>>> I do see a failure with dm on that, I'll take a look. >>>>>> >>>>>> I ran it on a file system atop nvme with 8 poll queues. >>>>>> >>>>>> liburing head: 9e1d69e078ee51f253a829ff421b17cfc996d158 >>>>>> linux-block head: ff1353802d86a9d8e40ef1377efb12a1d3000a20 >>>>> >>>>> Fixed it, and actually enabled a further cleanup. >>>> >>>> Great, thanks! Did you push that out somewhere? >>> >>> It's pushed to io_uring-5.9, current sha is: >>> >>> ee6ac2d3d5cc50d58ca55a5967671c9c1f38b085 >>> >>> FWIW, the issue was just for fixed buffers. It's running through the >>> usual testing now. >> >> OK. Since it was an unrelated problem, I was expecting a separate >> commit for it. What was the exact issue? Is it something that needs >> backporting to -stable? > > No, it was a bug in the posted patch, so I just folded in the fix. We must be hitting different problems, then. I just tested your 5.7-stable branch (running the test suite from an xfs file system on an nvme partition with polling enabled), and the read-write test fails: Running test read-write: Non-vectored IO not supported, skipping cqe res -22, wanted 2048 test_buf_select_short vec failed Test read-write failed with ret 1 That's with this head: a451911d530075352fbc7ef9bb2df68145a747ad -Jeff ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-18 17:55 ` Jeff Moyer @ 2020-08-18 18:00 ` Jens Axboe 2020-08-18 18:07 ` Jeff Moyer 0 siblings, 1 reply; 18+ messages in thread From: Jens Axboe @ 2020-08-18 18:00 UTC (permalink / raw) To: Jeff Moyer; +Cc: io-uring, david On 8/18/20 10:55 AM, Jeff Moyer wrote: > Jens Axboe <[email protected]> writes: > >> On 8/17/20 1:55 PM, Jeff Moyer wrote: >>> Jens Axboe <[email protected]> writes: >>> >>>> On 8/13/20 4:21 PM, Jeff Moyer wrote: >>>>> Jens Axboe <[email protected]> writes: >>>>> >>>>>>>>> BTW, what git sha did you run? >>>>>>>> >>>>>>>> I do see a failure with dm on that, I'll take a look. >>>>>>> >>>>>>> I ran it on a file system atop nvme with 8 poll queues. >>>>>>> >>>>>>> liburing head: 9e1d69e078ee51f253a829ff421b17cfc996d158 >>>>>>> linux-block head: ff1353802d86a9d8e40ef1377efb12a1d3000a20 >>>>>> >>>>>> Fixed it, and actually enabled a further cleanup. >>>>> >>>>> Great, thanks! Did you push that out somewhere? >>>> >>>> It's pushed to io_uring-5.9, current sha is: >>>> >>>> ee6ac2d3d5cc50d58ca55a5967671c9c1f38b085 >>>> >>>> FWIW, the issue was just for fixed buffers. It's running through the >>>> usual testing now. >>> >>> OK. Since it was an unrelated problem, I was expecting a separate >>> commit for it. What was the exact issue? Is it something that needs >>> backporting to -stable? >> >> No, it was a bug in the posted patch, so I just folded in the fix. > > We must be hitting different problems, then. I just tested your > 5.7-stable branch (running the test suite from an xfs file system on an > nvme partition with polling enabled), and the read-write test fails: > > Running test read-write: > Non-vectored IO not supported, skipping > cqe res -22, wanted 2048 > test_buf_select_short vec failed > Test read-write failed with ret 1 > > That's with this head: a451911d530075352fbc7ef9bb2df68145a747ad Not sure what this is, haven't seen that here and my regular liburing runs include both xfs-on-nvme(with poll queues) as one of the test points. Seems to me like there's two oddities in the above: 1) Saying that Non-vectored isn't supported, that is not true on 5.7. This is due to an -EINVAL return. 2) The test_buf_select_short_vec failure I'll see if I can reproduce this. Anything special otherwise enabled? Scheduler on the nvme device? nr_requests? XFS options? -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-18 18:00 ` Jens Axboe @ 2020-08-18 18:07 ` Jeff Moyer 2020-08-18 18:10 ` Jens Axboe 0 siblings, 1 reply; 18+ messages in thread From: Jeff Moyer @ 2020-08-18 18:07 UTC (permalink / raw) To: Jens Axboe; +Cc: io-uring, david Jens Axboe <[email protected]> writes: >> We must be hitting different problems, then. I just tested your >> 5.7-stable branch (running the test suite from an xfs file system on an >> nvme partition with polling enabled), and the read-write test fails: >> >> Running test read-write: >> Non-vectored IO not supported, skipping >> cqe res -22, wanted 2048 >> test_buf_select_short vec failed >> Test read-write failed with ret 1 >> >> That's with this head: a451911d530075352fbc7ef9bb2df68145a747ad > > Not sure what this is, haven't seen that here and my regular liburing > runs include both xfs-on-nvme(with poll queues) as one of the test > points. Seems to me like there's two oddities in the above: > > 1) Saying that Non-vectored isn't supported, that is not true on 5.7. > This is due to an -EINVAL return. > 2) The test_buf_select_short_vec failure > > I'll see if I can reproduce this. Anything special otherwise enabled? > Scheduler on the nvme device? nr_requests? XFS options? No changes from defaults. /dev/nvme0n1p1 on /mnt/test type xfs (rw,relatime,seclabel,attr2,inode64,logbufs=8,logbsize=32k,noquota) # xfs_info /dev/nvme0n1p1 meta-data=/dev/nvme0n1p1 isize=512 agcount=4, agsize=22893222 blks = sectsz=4096 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=0 = reflink=1 data = bsize=4096 blocks=91572885, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0, ftype=1 log =internal log bsize=4096 blocks=44713, version=2 = sectsz=4096 sunit=1 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 # cat /sys/block/nvme0n1/queue/scheduler [none] mq-deadline kyber bfq # cat /sys/block/nvme0n1/queue/nr_requests 1023 # cat /sys/module/nvme/parameters/poll_queues 8 I'll see if I can figure out what's going on. -Jeff ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHSET 0/2] io_uring: handle short reads internally 2020-08-18 18:07 ` Jeff Moyer @ 2020-08-18 18:10 ` Jens Axboe 0 siblings, 0 replies; 18+ messages in thread From: Jens Axboe @ 2020-08-18 18:10 UTC (permalink / raw) To: Jeff Moyer; +Cc: io-uring, david On 8/18/20 11:07 AM, Jeff Moyer wrote: > Jens Axboe <[email protected]> writes: > >>> We must be hitting different problems, then. I just tested your >>> 5.7-stable branch (running the test suite from an xfs file system on an >>> nvme partition with polling enabled), and the read-write test fails: >>> >>> Running test read-write: >>> Non-vectored IO not supported, skipping >>> cqe res -22, wanted 2048 >>> test_buf_select_short vec failed >>> Test read-write failed with ret 1 >>> >>> That's with this head: a451911d530075352fbc7ef9bb2df68145a747ad >> >> Not sure what this is, haven't seen that here and my regular liburing >> runs include both xfs-on-nvme(with poll queues) as one of the test >> points. Seems to me like there's two oddities in the above: >> >> 1) Saying that Non-vectored isn't supported, that is not true on 5.7. >> This is due to an -EINVAL return. >> 2) The test_buf_select_short_vec failure >> >> I'll see if I can reproduce this. Anything special otherwise enabled? >> Scheduler on the nvme device? nr_requests? XFS options? > > No changes from defaults. > > /dev/nvme0n1p1 on /mnt/test type xfs (rw,relatime,seclabel,attr2,inode64,logbufs=8,logbsize=32k,noquota) > > # xfs_info /dev/nvme0n1p1 > meta-data=/dev/nvme0n1p1 isize=512 agcount=4, agsize=22893222 blks > = sectsz=4096 attr=2, projid32bit=1 > = crc=1 finobt=1, sparse=1, rmapbt=0 > = reflink=1 > data = bsize=4096 blocks=91572885, imaxpct=25 > = sunit=0 swidth=0 blks > naming =version 2 bsize=4096 ascii-ci=0, ftype=1 > log =internal log bsize=4096 blocks=44713, version=2 > = sectsz=4096 sunit=1 blks, lazy-count=1 > realtime =none extsz=4096 blocks=0, rtextents=0 > > # cat /sys/block/nvme0n1/queue/scheduler > [none] mq-deadline kyber bfq > > # cat /sys/block/nvme0n1/queue/nr_requests > 1023 > > # cat /sys/module/nvme/parameters/poll_queues > 8 > > I'll see if I can figure out what's going on. Thanks, I'll be a bit busy the next 24h, but let me know how it goes and I'll dig into this too when I can. -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCHSET v2 0/2] io_uring: handle short reads internally @ 2020-08-14 19:54 Jens Axboe 2020-08-14 19:54 ` [PATCH 2/2] io_uring: internally retry short reads Jens Axboe 0 siblings, 1 reply; 18+ messages in thread From: Jens Axboe @ 2020-08-14 19:54 UTC (permalink / raw) To: io-uring; +Cc: david, jmoyer Hi, Since we've had a few cases of applications not dealing with this appopriately, I believe the safest course of action is to ensure that we don't return short reads when we really don't have to. The first patch is just a prep patch that retains iov_iter state over retries, while the second one actually enables just doing retries if we get a short read back. This passes all my testing, both liburing regression tests but also tests that explicitly trigger internal short reads and hence retry based on current state. No short reads are passed back to the application. Changes since v1: - Fixed an issue with fixed/registered buffers - Rewrite io_read() code flow to be a lot more readable - Add comments -- Jens Axboe ^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 2/2] io_uring: internally retry short reads 2020-08-14 19:54 [PATCHSET v2 " Jens Axboe @ 2020-08-14 19:54 ` Jens Axboe 0 siblings, 0 replies; 18+ messages in thread From: Jens Axboe @ 2020-08-14 19:54 UTC (permalink / raw) To: io-uring; +Cc: david, jmoyer, Jens Axboe We've had a few application cases of not handling short reads properly, and it is understandable as short reads aren't really expected if the application isn't doing non-blocking IO. Now that we retain the iov_iter over retries, we can implement internal retry pretty trivially. This ensures that we don't return a short read, even for buffered reads on page cache conflicts. Cleanup the deep nesting and hard to read nature of io_read() as well, it's much more straight forward now to read and understand. Added a few comments explaining the logic as well. Signed-off-by: Jens Axboe <[email protected]> --- fs/io_uring.c | 111 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 40 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 409cfa1d6d90..e1aacc0b9bd0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -509,6 +509,7 @@ struct io_async_msghdr { struct io_async_rw { struct iovec fast_iov[UIO_FASTIOV]; struct iov_iter iter; + size_t bytes_done; struct wait_page_queue wpq; }; @@ -914,7 +915,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock); static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec, - struct iov_iter *iter); + struct iov_iter *iter, bool force); static struct kmem_cache *req_cachep; @@ -2296,7 +2297,7 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) ret = io_import_iovec(rw, req, &iovec, &iter, false); if (ret < 0) goto end_req; - ret = io_setup_async_rw(req, iovec, &iter); + ret = io_setup_async_rw(req, iovec, &iter, false); if (!ret) return true; kfree(iovec); @@ -2586,6 +2587,14 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + /* add previously done IO, if any */ + if (req->io && req->io->rw.bytes_done > 0) { + if (ret < 0) + ret = req->io->rw.bytes_done; + else + ret += req->io->rw.bytes_done; + } + if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && kiocb->ki_complete == io_complete_rw) @@ -2932,6 +2941,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_async_rw *rw = &req->io->rw; memcpy(&rw->iter, iter, sizeof(*iter)); + rw->bytes_done = 0; if (!iovec) { rw->iter.iov = rw->fast_iov; if (rw->fast_iov != iter->iov) @@ -2958,9 +2968,9 @@ static int io_alloc_async_ctx(struct io_kiocb *req) } static int io_setup_async_rw(struct io_kiocb *req, struct iovec *iovec, - struct iov_iter *iter) + struct iov_iter *iter, bool force) { - if (!io_op_defs[req->opcode].async_ctx) + if (!force && !io_op_defs[req->opcode].async_ctx) return 0; if (!req->io) { if (__io_alloc_async_ctx(req)) @@ -3084,8 +3094,7 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, * succeed, or in rare cases where it fails, we then fall back to using the * async worker threads for a blocking retry. */ -static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, - struct iovec *fast_iov, struct iov_iter *iter) +static bool io_rw_should_retry(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -3094,8 +3103,8 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, if (req->flags & REQ_F_NOWAIT) return false; - /* already tried, or we're doing O_DIRECT */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) + /* Only for buffered IO */ + if (kiocb->ki_flags & IOCB_DIRECT) return false; /* * just use poll if we can, and don't attempt if the fs doesn't @@ -3104,16 +3113,6 @@ static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec, if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) return false; - /* - * If request type doesn't require req->io to defer in general, - * we need to allocate it here - */ - if (!req->io) { - if (__io_alloc_async_ctx(req)) - return false; - io_req_map_rw(req, iovec, iter); - } - ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, io_async_buf_func, req); if (!ret) { @@ -3140,8 +3139,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; + ssize_t io_size, ret, ret2; size_t iov_count; - ssize_t io_size, ret, ret2 = 0; if (req->io) iter = &req->io->rw.iter; @@ -3151,6 +3150,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, return ret; io_size = ret; req->result = io_size; + ret = 0; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -3165,31 +3165,62 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (unlikely(ret)) goto out_free; - ret2 = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, iter); - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2, cs); - } else { + if (!ret) { + goto done; + } else if (ret == -EIOCBQUEUED) { + ret = 0; + goto out_free; + } else if (ret == -EAGAIN) { + ret2 = io_setup_async_rw(req, iovec, iter, false); + if (ret2 < 0) + ret = ret2; + goto out_free; + } else if (ret < 0) { + goto out_free; + } + + /* read it all, or we did blocking attempt. no retry. */ + if (!iov_iter_count(iter) || !force_nonblock) + goto done; + + io_size -= ret; copy_iov: - ret = io_setup_async_rw(req, iovec, iter); - if (ret) - goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req, iovec, inline_vecs, iter)) { - ret2 = io_iter_do_read(req, iter); - if (ret2 == -EIOCBQUEUED) { - goto out_free; - } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - goto out_free; - } - } + ret2 = io_setup_async_rw(req, iovec, iter, true); + if (ret2) { + ret = ret2; + goto out_free; + } + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + /* now use our persistent iterator, if we aren't already */ + iter = &req->io->rw.iter; +retry: + req->io->rw.bytes_done += ret; + /* if we can retry, do so with the callbacks armed */ + if (!io_rw_should_retry(req)) { kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN; } + + /* + * Now retry read with the IOCB_WAITQ parts set in the iocb. If we + * get -EIOCBQUEUED, then we'll get a notification when the desired + * page gets unlocked. We can also get a partial read here, and if we + * do, then just retry at the new offset. + */ + ret = io_iter_do_read(req, iter); + if (ret == -EIOCBQUEUED) { + ret = 0; + goto out_free; + } else if (ret > 0 && ret < io_size) { + /* we got some bytes, but not all. retry. */ + goto retry; + } +done: + kiocb_done(kiocb, ret, cs); + ret = 0; out_free: if (iovec) kfree(iovec); @@ -3282,7 +3313,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, kiocb_done(kiocb, ret2, cs); } else { copy_iov: - ret = io_setup_async_rw(req, iovec, iter); + ret = io_setup_async_rw(req, iovec, iter, false); if (ret) goto out_free; /* it's copied and will be cleaned with ->io */ -- 2.28.0 ^ permalink raw reply related [flat|nested] 18+ messages in thread
end of thread, other threads:[~2020-08-18 18:10 UTC | newest] Thread overview: 18+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2020-08-13 17:56 [PATCHSET 0/2] io_uring: handle short reads internally Jens Axboe 2020-08-13 17:56 ` [PATCH 1/2] io_uring: retain iov_iter state over io_read/io_write calls Jens Axboe 2020-08-13 17:56 ` [PATCH 2/2] io_uring: internally retry short reads Jens Axboe 2020-08-13 20:25 ` [PATCHSET 0/2] io_uring: handle short reads internally Jeff Moyer 2020-08-13 20:33 ` Jens Axboe 2020-08-13 20:37 ` Jens Axboe 2020-08-13 20:41 ` Jens Axboe 2020-08-13 20:49 ` Jeff Moyer 2020-08-13 22:08 ` Jens Axboe 2020-08-13 22:21 ` Jeff Moyer 2020-08-13 22:31 ` Jens Axboe 2020-08-17 20:55 ` Jeff Moyer 2020-08-17 20:57 ` Jens Axboe 2020-08-18 17:55 ` Jeff Moyer 2020-08-18 18:00 ` Jens Axboe 2020-08-18 18:07 ` Jeff Moyer 2020-08-18 18:10 ` Jens Axboe -- strict thread matches above, loose matches on Subject: below -- 2020-08-14 19:54 [PATCHSET v2 " Jens Axboe 2020-08-14 19:54 ` [PATCH 2/2] io_uring: internally retry short reads Jens Axboe
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox